Scraping Weibo with Python and Visualizing It

Required Modules

  • jieba
  • PIL
  • numpy
  • matplotlib
  • wordcloud
  • BeautifulSoup4
  • selenium

Code for Scraping the Text

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from selenium.webdriver.common.keys import Keys


class MSpider(object):
    """docstring for MSpider"""
    def __init__(self):
        super(MSpider, self).__init__()
        # Path to chromedriver.exe (see the note on Chrome's install directory below)
        self.__ChromeDriverPath = "C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe"
        # HTTP request headers
        self.__SpiderHeader = {"Accept": "application/json, text/javascript, */*; q=0.01",
                               "Accept-Encoding": "gzip, deflate, br",
                               "Accept-Language": "zh-CN,zh;q=0.8",
                               "Connection": "keep-alive",
                               "Host": "m.weibo.cn",
                               "Referer": "https://m.weibo.cn/?&jumpfrom=weibocom",
                               "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.7 Mobile Safari/537.36",
                               "X-Requested-With": "XMLHttpRequest"}
        # HTTP cookies
        self.__SpiderCooike = {"_T_WM": "c441c4711a077fac8546fd780b95c19a",
                               "SCF": "AvRIbwdXyJLpLdOW3-ieuamD47vYerurAqfqa0Jz6EcslM6SZoXD_y07ClyiooCMaZY-CgoKmzuWcWf5dBQ90M8.",
                               "H5:PWA:UID": "1",
                               "SUB": "_2A253E77dDeRhGeBO4lUR-S_MzjqIHXVU_8KVrDV6PUJbkdBeLVbwkW1NRY59KjI0-Fh9pXhZoRLEMps0IjFRAJVW",
                               "SUHB": "0A0JUmK4V5jGT2",
                               "M_WEIBOCN_PARAMS": "featurecode%3D20000320%26lfid%3D102803_ctg1_8999_-_ctg1_8999_home%26luicode%3D20000174%26uicode%3D20000174"}
        # URL of the Weibo user profile page you want to scrape,
        # e.g. https://m.weibo.cn/u/123456789
        self.__SpiderUrl = "balabalabala"
        # Login page of the mobile Weibo site
        self.__SpiderLoginInUrl = "https://m.weibo.cn"
        # Account credentials
        self.__UserName = "your Weibo account"
        self.__Password = "your Weibo password"
        isSucceed, name = self.InitAndLoginInPage()
        if isSucceed:
            print("Init And Login Succeed --- " + name)

    # Initialize the browser, log in, and return (success, screen name)
    def InitAndLoginInPage(self):
        # Spoof the mobile User-Agent so Weibo serves the m.weibo.cn layout
        options = webdriver.ChromeOptions()
        options.add_argument("User-Agent=" + self.__SpiderHeader["User-Agent"])
        # Launch the automated Chrome browser
        driver = webdriver.Chrome(executable_path=self.__ChromeDriverPath, chrome_options=options)
        self.__Driver = driver
        time.sleep(3)
        # Open the Weibo login page
        driver.get(self.__SpiderLoginInUrl)
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "J-call-app")))
        except Exception as e:
            return False, None
        # Click through to the login form
        toLoginButton = driver.find_element_by_css_selector(".btn.btnWhite")
        ActionChains(driver).double_click(toLoginButton).perform()
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "loginName")))
        except Exception as e:
            return False, None
        time.sleep(1)
        # Enter the account name and password, then submit
        username = driver.find_element_by_id("loginName")
        ActionChains(driver).double_click(username).perform()
        username.send_keys(self.__UserName)
        password = driver.find_element_by_id("loginPassword")
        ActionChains(driver).double_click(password).perform()
        password.send_keys(self.__Password)
        password.send_keys(Keys.ENTER)
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "box")))
        except Exception as e:
            return False, None
        mName = driver.find_element_by_xpath('//p[@data-node="title"]').text
        return True, mName

    # Scroll through the profile page and collect the text of every post
    def GetWords(self):
        driver = self.__Driver
        driver.get(self.__SpiderUrl)
        words = set()
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "app")))
        except Exception as e:
            print("To Spider Page Failed")
            return words
        time.sleep(1)
        spiderPageUserName = driver.find_element_by_class_name("txt-shadow").text
        print("To Spider Page Succeed --- " + spiderPageUserName)
        # Scroll to the bottom repeatedly to trigger lazy loading of older posts
        js = "window.scrollTo(0, document.body.scrollHeight);"
        for i in range(80):
            driver.execute_script(js)
            time.sleep(1)
            print("Scroll Number : " + str(i))
        # Parse the fully loaded page and collect the text of each post
        bsObj = BeautifulSoup(driver.page_source, "html.parser")
        containers = bsObj.find_all("div", {"class": "weibo-og"})
        for container in containers:
            words.add(container.find("div", {"class": "weibo-text"}).text)
        return words

    # Close the browser
    def Close(self):
        if self.__Driver is None:
            return
        self.__Driver.close()

The request headers and cookies can be copied straight from the browser: open Chrome's developer tools, switch to mobile emulation mode, log in to Weibo, and copy them from there.
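Incidentally, once you have a valid set of cookies, the form-filling login can be skipped entirely by injecting them into the selenium session. A minimal sketch (the helper name is mine, assuming the cookie dict from the class above); note that add_cookie only accepts cookies for the domain currently loaded, so the page must be visited once first:

# Hypothetical alternative to form-based login: reuse cookies copied from Chrome.
# add_cookie() only works for the domain the browser is currently on,
# so visit m.weibo.cn once before adding them, then reload.
def login_with_cookies(driver, cookies):
    driver.get("https://m.weibo.cn")
    for name, value in cookies.items():
        driver.add_cookie({"name": name, "value": value})
    driver.get("https://m.weibo.cn")  # reload; requests now carry the cookies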

This code uses selenium to drive the Chrome browser through the user's Weibo page automatically, scrolling to the bottom over and over; after enough iterations, all of the user's posts are loaded. BeautifulSoup then parses the resulting HTML to extract all of the user's text, which is stored in a set() so that no passage is duplicated.
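The fixed count of 80 scrolls is a guess that may over- or undershoot for a given account. A variation (my sketch, not the author's code) is to keep scrolling until the page height stops growing:

import time

# Sketch: scroll until document.body.scrollHeight stops increasing,
# i.e. no more posts are being lazy-loaded.
def scroll_to_end(driver, pause=1.0, max_stalls=3):
    last_height = driver.execute_script("return document.body.scrollHeight")
    stalls = 0
    while stalls < max_stalls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        stalls = stalls + 1 if new_height == last_height else 0
        last_height = new_height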

Note: you must wait a few seconds after initializing the browser, otherwise inexplicable bugs appear.
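If the fixed sleep feels fragile, one more deterministic option (again a sketch, not from the original code) is to wait until the page reports it has finished loading:

from selenium.webdriver.support.ui import WebDriverWait

# Sketch: block for up to 10 seconds until the DOM is fully loaded.
WebDriverWait(driver, 10).until(
    lambda d: d.execute_script("return document.readyState") == "complete")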

Before using selenium, you need to copy the chromedriver.exe driver into Chrome's installation directory so that it can drive the Chrome browser.

Download chromedriver.exe: chromedriver.storage.googleapis.com
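Alternatively, if chromedriver.exe sits in a directory on your PATH, selenium can locate it on its own and the explicit executable_path argument can be dropped:

from selenium import webdriver

# With chromedriver on PATH, no explicit executable_path is needed.
driver = webdriver.Chrome()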


Spider Main Program

from Spider import MSpider

weiboSpider = MSpider()
weiboTxt = open("weibo.txt", "w", encoding="utf-8")
counter = 0
# Write every scraped post into weibo.txt and count them
for word in weiboSpider.GetWords():
    weiboTxt.write(word)
    counter = counter + 1
weiboTxt.close()
weiboSpider.Close()
print("Finish --- Counter --- " + str(counter))

This saves all of the scraped user's text to a txt file encoded as utf-8, ready for the next step: analysis and visualization.
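One small variation worth considering (my suggestion, not in the original): writing one post per line keeps weibo.txt human-readable, and jieba's segmentation is unaffected either way.

# Sketch: same save loop, but with a newline after each post.
with open("weibo.txt", "w", encoding="utf-8") as weiboTxt:
    for word in weiboSpider.GetWords():
        weiboTxt.write(word + "\n")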


Text Analysis + Visualization

import jieba
from collections import Counter
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Punctuation and high-frequency function words to skip when counting
stopWords = ["!", "@", "#", "¥", "%", " ", "&", "*", "(", ")", "。", "...", ",", "/ ", ": ", "- ", "… ", "? ", "你", "我", "了", "人", "的", "啊", "好", "是", "也", "嗎", "吧"]

weiboFile = open("AllWeiboTxt/weibo.txt", "r", encoding="utf-8")
analyzeString = weiboFile.read()
weiboFile.close()

# Segment the Chinese text with jieba (full mode) and count word frequencies
words = jieba.cut(analyzeString, cut_all=True)
word_freq = {}
for word in words:
    if word not in stopWords:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

# Sort by frequency and print the 100 most common words
freq_word = []
for word, freq in word_freq.items():
    freq_word.append((word, freq))
freq_word.sort(key=lambda x: x[1], reverse=True)
for word, freq in freq_word[:100]:
    print(word, freq)

# Fill the mask image with words sized by frequency
alice_mask = np.array(Image.open("Images/Original3.jpg"))
wc = WordCloud(font_path="Font/hanyi.ttf", background_color="white", max_words=3000, mask=alice_mask, max_font_size=220)
wc.generate_from_frequencies(word_freq)
wc.to_file("Images/HerWords3.png")

jieba extracts the words from the txt file and their frequencies are counted. Images/Original3.jpg is the original image to be filled; WordCloud fills words into it according to frequency, so the higher a word's frequency, the larger the area it occupies. font_path is the font used to render the words; since the posts being analyzed are in Chinese, a Chinese font must be explicitly specified, because the default font cannot display Chinese characters.
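As an aside, the script imports collections.Counter but never uses it; the manual frequency loop and the sort-and-slice step can both be collapsed into it. A sketch using the same stopWords and analyzeString as above (Counter is a dict subclass, so generate_from_frequencies accepts it unchanged):

import jieba
from collections import Counter

# Sketch: Counter replaces the manual word_freq dict and the sort.
words = jieba.cut(analyzeString, cut_all=True)
word_freq = Counter(w for w in words if w not in stopWords)

# most_common() returns (word, freq) pairs already sorted by frequency.
for word, freq in word_freq.most_common(100):
    print(word, freq)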


Finally, here is the original image the author used in the code.

