用 rvest
爬資料 將 Mobile01 的 iPhone 板 前 1000 頁的貼文標題抓下來。
參考資料:http://r3dmaotech.blogspot.com/2016/05/r-rvest.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
require ( rvest )
require ( dplyr )
# Mobile 01 website links
links <- paste0 ( "https://www.mobile01.com/topiclist.php?f=383&p=" , 1 : 1000 )
data <- c ()
for ( i in 1 : length ( links )){
url <- links[i]
content <- read_html ( url ) %>% html_nodes ( ".u-ellipsis" ) %>%
html_text ()
temp <- iconv ( content , 'utf8' )
data <- c ( data , temp )
##sleep time
Sys.sleep ( runif ( 1 , 0.5 , 0.8 ))
}
titles <- data [seq ( 1 , length ( data ), by = 3 ) ]
# Omit NAs
titles <- titles[ ! is.na ( titles ) ]
寫好 regex 備用 先將可以抓出標題中各型號 iPhone 的 regex 寫好。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
i6S <- "[iI](PHONE|Phone|phone)? ?(6s|6S)(?! ?[pP])"
i6SPlus <- "[iI](PHONE|Phone|phone)? ?(6s|6S) ?(PLUS|Plus|plus)"
iSE <- "[iI](PHONE|Phone|phone)? ?(SE|se)(?! ?2)"
i7 <- "[iI](PHONE|Phone|phone)? ?7(?! ?[pP])"
i7Plus <- "[iI](PHONE|Phone|phone)? ?7 ?(PLUS|Plus|plus)"
i8 <- "[iI](PHONE|Phone|phone)? ?8(?! ?[pP])"
i8Plus <- "[iI](PHONE|Phone|phone)? ?8 ?(PLUS|Plus|plus)"
iX <- "[iI](PHONE|Phone|phone)? ?[xX](?! ?[sS])(?! ?[rR])"
iXS <- "[iI](PHONE|Phone|phone)? ?(XS|Xs|xs)(?! ?[mM])"
iXSMax <- "[iI](PHONE|Phone|phone)? ?(XS|Xs|xs) ?(MAX|Max|max)"
iXR <- "[iI](PHONE|Phone|phone)? ?(XR|Xr|xr)"
i11 <- "[iI](PHONE|Phone|phone)? ?(11) ?(?! ?[pP])"
i11Pro <- "[iI](PHONE|Phone|phone)? ?(11) ?(PRO|Pro|pro)(?! ?[mM])"
i11ProMax <- "[iI](PHONE|Phone|phone)? ?(11) ?(PRO|Pro|pro) ?(MAX|Max|max)"
i9_SE2 <- "[iI](PHONE|Phone|phone)? ?(9| ?(SE2|Se2|se2))"
iphones <- c ( i6S , i6SPlus , iSE , i7 , i7Plus , i8 , i8Plus , iX , iXS , iXSMax , iXR , i11 , i11Pro , i11ProMax , i9_SE2 )
使用 stringr
找出 iPhone 各型號分別有幾篇討論。
1
2
3
4
5
6
7
library ( stringr )
iphones_in_posts <- c ()
for ( i in seq_along ( iphones )){
iphone_in_posts <- sum ( str_detect ( titles , iphones[i] ))
iphones_in_posts[i] <- iphone_in_posts
}
製作表格 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
library ( tibble )
df <- tibble ( Model = c ( "i6S" , "i6S Plus" , "iPhone SE" , "i7" , "i7 Plus" , "i8" , "i8 Plus" , "iX" , "iXS" , "iXS Max" , "iXR" , "i11" , "i11 Pro" , "i11 Pro Max" , "i9/SE2" ), n = iphones_in_posts )
df <- df %>%
mutate ( Order = 1 : n ()) %>%
arrange ( desc ( n ))
df
## # A tibble: 15 x 3
## Model n Order
## <chr> <int> <int>
## 1 iX 1874 8
## 2 i7 1813 4
## 3 i6S 1442 1
## 4 i8 988 6
## 5 i7 Plus 767 5
## 6 i11 466 12
## 7 i8 Plus 406 7
## 8 i6S Plus 403 2
## 9 iXR 403 11
## 10 iXS Max 385 10
## 11 iXS 336 9
## 12 iPhone SE 334 3
## 13 i11 Pro 182 13
## 14 i11 Pro Max 150 14
## 15 i9/SE2 30 15
Visualization 1
2
3
4
5
6
7
library ( ggplot2 )
ggplot ( df , aes ( x = reorder ( Model , - n ), y = n )) +
geom_bar ( stat = "identity" ,
fill = "#fcba03" ) +
xlab ( "Model" ) +
theme ( axis.text.x = element_text ( angle = 45 , hjust = 1 ))
1
2
3
4
5
6
7
require ( ggrepel )
ggplot ( df , aes ( x = n , y = Order , label = Model )) +
geom_point ( aes ( color = factor ( Model ))) +
theme_bw () +
geom_text_repel () +
theme ( legend.position = "none" )
Explanation 從 Mobile01 爬下的前 1000 頁標題資料中可以發現,文章數最多的
iX、i7,以及 i6S
都已經推出一段時間了,所以累積比較多討論。文章數最少的則是最晚推出的 i11
Pro 和 i11 Pro Max,以及預計今年四月中才會發表的 i9
(SE2)。原本預期會看到許多 i9 (SE2) 的討論,但 Mobile 01
上似乎沒有此現象。i6S Plus 和 iPhone SE
雖然是較早期機種,文章數卻比新機少很多,之後預計會爬內文進一步分析。
Photo by Arnel Hasanovic on Unsplash