《戰(zhàn)狼Ⅱ》破50億了,你還不知道它在說(shuō)啥?本文通過(guò)Python爬蟲抓取獲取12萬(wàn)條影評(píng)分析,告訴你《戰(zhàn)狼Ⅱ》用什么撩到了你。
import
requests import re import pandas as pd url_first='https://movie.douban.com/subject/26363254/comments?start=0' head={'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36'} html=requests.get(url_first,headers=head,cookies=cookies) cookies={'cookie':'你自己的cookie'} #也就是找到你的賬號(hào)對(duì)應(yīng)的cookie reg=re.compile(r'.*?.*?(.*?).*?(.*?).*?title="(.*?)">.*?title="(.*?)">.*?class=""> (.*?)\n',re.S) #評(píng)論等內(nèi)容 while html.status_code==200: url_next='https://movie.douban.com/subject/26363254/comments'+re.findall(reg,html.text)[0] zhanlang=re.findall(ren,html.text) data=pd.DataFrame(zhanlang) data.to_csv('/home/wajuejiprince/文檔/zhanlang/zhanlangpinglun.csv', header=False,index=False,mode='a+') #寫入csv文件,'a+'是追加模式 data=[] zhanlang=[] html=requests.get(url_next,cookies=cookies,headers=head)
library(data.table)
library(plotly)
library(stringr)
library(jiebaR)
library(wordcloud2)
library(magrittr)
dt<-fread(file.choose()) #導(dǎo)入數(shù)據(jù)
dt[,c("V8","V9","V10","V11","V12","V13"):=NULL] #刪除空列
#一條命令清洗數(shù)據(jù)
my_dt<-dt[str_detect(贊成評(píng)論數(shù),"\\d+")][評(píng)論有用=='有用'][是否看過(guò)=="看過(guò)"][五星數(shù)%in%c("很差","較差","還行","推薦","力薦")]
wk <- worker()
sw<-function(x){wk<=x}
segwords<-lapply(my_dt[,評(píng)論內(nèi)容],sw)
my_segwords<-unlist(segwords) #不要列表
#去除停止詞
st<-readLines(file.choose()) #讀取停止詞
stopwords<-c(NULL)
for(i in 1:length(st))
{
stopwords[i]<-st[i]
}
seg_Words<-filter_segment(my_segwords,stopwords) #去除中文停止詞
words<-table(seg_Words)%>%data.table()
setnames(words,"N","pinshu")
words[pinshu>1000] #去除較低頻數(shù)的詞匯(小于1000的)
wordcloud2(words[pinshu>1000], size = 2, fontFamily = "微軟雅黑",color = "random-light", backgroundColor = "grey")
由于數(shù)據(jù)太多,導(dǎo)致我的破電腦卡頓,所以在制作云圖的時(shí)候去掉了頻數(shù)低于1000的詞匯。
Python交流群
635448130點(diǎn)擊加入群聊UI設(shè)計(jì)交流群
579150876點(diǎn)擊加入群聊Unity交流群
495609038點(diǎn)擊加入群聊HTML5交流群
645591648點(diǎn)擊加入群聊