爬山要體力爬蟲要腦力

04-28

自從遇到R，我就把日子過得破釜沉舟呵。

持續為學習這種大概率事件下注。不多說，開始爬~

# Scrape the IMDb "feature films of 2016" listing with rvest, clean every
# field, assemble the result into a data frame and draw a first histogram.
#
# Lines starting with `#>` are console output recorded when this post was
# written. The live page changes over time, so a fresh run will differ.
library(rvest)
library(ggplot2)

# Insert NA at each of `positions` (1-based indices in the *final* vector).
#
# The listing simply omits the node for a film that lacks a field, so the
# scraped vector is shorter than the list of films. Padding it back keeps
# every column aligned with `rank`. Positions are applied in ascending order
# so that each one refers to the already-padded vector.
insert_na_at <- function(x, positions) {
  for (pos in sort(positions)) {
    # append(after = ) also handles pos == 1 and pos == length(x) + 1,
    # which the hand-rolled x[1:(pos - 1)] slicing does not.
    x <- append(x, NA, after = pos - 1)
  }
  x
}

# Read the page.
url <- "http://www.imdb.com/search/title?year=2016,2016&title_type=feature&sort=moviemeter,asc"
webpage <- read_html(url)

# rank: select by class name; a class selector is written with a leading dot.
rank_data_html <- html_nodes(webpage, ".text-primary")
rank_data <- html_text(rank_data_html)
rank_data <- as.numeric(rank_data)

# title: `.lister-item-header` is the <h3> row holding the title and the
# title itself sits in an <a> inside it. A space between the two parts of
# the selector means "descendant of".
title_data_html <- html_nodes(webpage, ".lister-item-header a")
title_data <- html_text(title_data_html)

# description: the `.text-muted` paragraph that immediately follows the
# `.ratings-bar` <div>; `+` selects the adjacent sibling.
description_data_html <- html_nodes(webpage, ".ratings-bar+.text-muted")
description_data <- html_text(description_data_html)
# Strip the newline characters embedded in the scraped text.
description_data <- gsub("\n", "", description_data)
head(description_data)
#> [1] "Three girls are kidnapped by a man with a diagnosed 23 distinct
#>      personalities. They must try to escape before the apparent emergence
#>      of a frightful new 24th."
#> [2] "In a city of humanoid animals, a hustling theater impresarios attempt
#>      to save his theater with a singing competition becomes grander than he
#>      anticipates even as its finalists find that their lives will never be
#>      the same."
#> [3] "Gloria is an out-of-work party girl forced to leave her life in NY and
#>      move back home. When reports surface that a giant creature is
#>      destroying Seoul, she gradually comes to the realization that she is
#>      somehow connected to this phenomenon."
#> [4] "A true-life drama, centering on British explorer Col. Percival
#>      Fawcett, who disappeared while searching for a mysterious city in the
#>      Amazon in the 1920s."
#> [5] "The quiet life of a terrier named Max is upended when his owner takes
#>      in Duke, a stray whom Max instantly dislikes."
#> [6] "The Rebel Alliance makes a risky move to steal the plans for the Death
#>      Star, setting up the epic saga to follow."

# runtime: a `.runtime` <span> nested inside the `.text-muted` <p> row.
runtime_data_html <- html_nodes(webpage, ".text-muted .runtime")
runtime_data <- html_text(runtime_data_html)
# Drop the unit; as.numeric() tolerates the blank left behind.
runtime_data <- as.numeric(gsub("min", "", runtime_data))
head(runtime_data)
#> [1] 117 108 109 141  87 133

# genre: `.genre` is a self-contained node, so it can be selected directly.
genre_data_html <- html_nodes(webpage, ".genre")
genre_data <- html_text(genre_data_html)
# Remove newline characters.
genre_data <- gsub("\n", "", genre_data)
head(genre_data)
#> [1] "Horror, Thriller            "  "Animation, Comedy, Family            "
#> [3] "Action, Comedy, Drama            " "Action, Adventure, Biography            "
#> [5] "Animation, Adventure, Comedy            " "Action, Adventure, Sci-Fi            "
# Remove spaces.
genre_data <- gsub(" ", "", genre_data)
head(genre_data)
#> [1] "Horror,Thriller"            "Animation,Comedy,Family"
#> [3] "Action,Comedy,Drama"        "Action,Adventure,Biography"
#> [5] "Animation,Adventure,Comedy" "Action,Adventure,Sci-Fi"
# Keep only the first genre: in a regular expression `.` matches any
# character except a newline and `*` repeats the preceding item zero or more
# times, so ",.*" matches everything from the first comma onwards.
genre_data <- gsub(",.*", "", genre_data)
head(genre_data)
#> [1] "Horror"    "Animation" "Action"    "Action"    "Animation" "Action"
genre_data <- as.factor(genre_data)
head(genre_data)
#> [1] Horror    Animation Action    Action    Animation Action
#> Levels: Action Adventure Animation Biography Comedy Crime Drama Horror Romance

# rating: the <strong> element inside the `.ratings-imdb-rating` <div>.
rating_data_html <- html_nodes(webpage, ".ratings-imdb-rating strong")
rating_data <- html_text(rating_data_html)
rating_data <- as.numeric(rating_data)

# votes: the <span> that is the 2nd child of the `.sort-num_votes-visible`
# <p> row (`:nth-child(2)`).
votes_data_html <- html_nodes(
  webpage, ".sort-num_votes-visible span:nth-child(2)"
)
votes_data <- html_text(votes_data_html)
# Drop the thousands separators before converting.
votes_data <- gsub(",", "", votes_data)
votes_data <- as.numeric(votes_data)

# directors: the first <a> (`:nth-child(1)`) inside the <p> that directly
# follows a `.text-muted` node (`+` = adjacent sibling, space = descendant).
directors_data_html <- html_nodes(webpage, ".text-muted+ p a:nth-child(1)")
directors_data <- html_text(directors_data_html)
directors_data <- as.factor(directors_data)

# actors: `.lister-item-content` is the main <div> holding everything for one
# film; the first actor is the <a> right after the `.ghost` <span> separator.
actors_data_html <- html_nodes(webpage, ".lister-item-content .ghost+a")
actors_data <- html_text(actors_data_html)
actors_data <- as.factor(actors_data)

# metascore: a self-contained node, selected directly by class.
metascore_data_html <- html_nodes(webpage, ".metascore")
metascore_data <- html_text(metascore_data_html)
head(metascore_data)
#> [1] "62        " "59        " "70        " "78        " "61        " "65        "
# Remove spaces.
metascore_data <- gsub(" ", "", metascore_data)
head(metascore_data)
#> [1] "62" "59" "70" "78" "61" "65"
length(metascore_data)
#> [1] 48
# Films 11 and 13 had no metascore on the page when this was written; pad
# them with NA so the vector lines up with the other 50-row columns.
# NOTE(review): these positions are specific to that snapshot of the page and
# must be re-checked against the listing before re-running.
metascore_data <- insert_na_at(metascore_data, c(11, 13))
metascore_data <- as.numeric(metascore_data)
length(metascore_data)
#> [1] 50
summary(metascore_data)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
#>   23.00   49.00   65.00   61.62   71.25   99.00       2

# gross: `~` is the general-sibling combinator. Starting from the `.ghost`
# <span>, take a following `.text-muted` sibling and then the <span> directly
# after it.
gross_data_html <- html_nodes(webpage, ".ghost~ .text-muted+ span")
gross_data <- html_text(gross_data_html)
# Strip the "M" suffix, then the leading "$": substring(x, 2, 6) keeps
# characters 2 through 6 (see ?substring).
gross_data <- gsub("M", "", gross_data)
gross_data <- substring(gross_data, 2, 6)
length(gross_data)
#> [1] 43
# Seven films had no gross figure on the page when this was written.
# NOTE(review): snapshot-specific positions, as for metascore above.
gross_data <- insert_na_at(gross_data, c(11, 13, 14, 23, 40, 41, 47))
length(gross_data)
#> [1] 50
gross_data <- as.numeric(gross_data)
length(gross_data)
#> [1] 50
summary(gross_data)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
#>    0.10   19.81   60.31  113.70  154.40  532.10       7

# Build the data frame `movies_df`; every column must have 50 entries.
movies_df <- data.frame(
  rank = rank_data,
  title = title_data,
  description = description_data,
  runtime = runtime_data,
  genre = genre_data,
  rating = rating_data,
  metascore = metascore_data,
  votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  directors = directors_data,
  actors = actors_data
)
str(movies_df)
# The recorded output shows character columns as factors, presumably because
# it was captured on R < 4.0, where stringsAsFactors defaulted to TRUE.
#> 'data.frame': 50 obs. of 11 variables:
#>  $ rank                : num 1 2 3 4 5 6 7 8 9 10 ...
#>  $ title               : Factor w/ 50 levels "A Dark Song",..: 33 31 8 40 43 28 19 14 34 22 ...
#>  $ description         : Factor w/ 50 levels "A chronicle of the childhood, adolescence and burgeoning adulthood of a young, African-American, gay man growing up in a rough "| __truncated__,..: 45 30 27 17 41 42 12 40 15 31 ...
#>  $ runtime             : num 117 108 109 141 87 133 128 133 123 107 ...
#>  $ genre               : Factor w/ 9 levels "Action","Adventure",..: 8 3 1 1 3 1 5 2 1 3 ...
#>  $ rating              : num 7.4 7.2 6.5 7.1 6.6 7.9 8.3 7.5 6.3 7.7 ...
#>  $ metascore           : num 62 59 70 78 61 65 93 66 40 81 ...
#>  $ votes               : num 143795 57064 5938 5752 117606 ...
#>  $ Gross_Earning_in_Mil: num 138.1 270.3 2.09 6.64 368.3 ...
#>  $ directors           : Factor w/ 50 levels "Ana Lily Amirpour",..: 31 11 35 23 10 19 12 15 13 39 ...
#>  $ actors              : Factor w/ 45 levels "Aaron Poole",..: 23 32 5 12 28 19 37 16 44 6 ...

# Visualisation: histogram of runtime, stacked by genre. Written with
# ggplot() + geom_histogram() because qplot() is deprecated in ggplot2.
ggplot(movies_df, aes(x = runtime, fill = genre)) +
  geom_histogram(bins = 30)

# Scatter plot of runtime against rating: point size encodes the vote count
# and colour encodes the genre. A single `+` joins the layers; the doubled
# `++` was a console continuation prompt fused into the code.
ggplot(movies_df, aes(x = runtime, y = rating)) +
  geom_point(aes(size = votes, col = genre))

# Scatter plot of runtime against gross earnings (in millions): point size
# encodes the rating and colour encodes the genre. A single `+` joins the
# layers; the doubled `++` was a console continuation prompt fused into the
# code.
ggplot(movies_df, aes(x = runtime, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = rating, col = genre))

speak：死磕了好幾天，此刻對網頁爬數據以及看源代碼有點感覺了。文中在爬取節點方面的總結，不知背後站著多少摸索與試錯，一次又一次，一遍又一遍，直至有了眉目。哎~不說了，說多了都是汗呵~

還有，可視化部分，欠佳，有待死磕。

爬山要體力 爬蟲要腦力

爬山要體力爬蟲要腦力