Crawling and Briefly Analyzing the Follower Composition of a Zhihu User with Python
From the column: Antenna的python學習筆記 (Antenna's Python study notes)
Original author: Charles的皮卡丘 (WeChat official account)
1. Project Goal
Use Python to crawl the followers of a Zhihu user and briefly analyze their composition.
2. Environment
Python 3.6
pyecharts, requests, jieba, fake_useragent
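One caveat on versions: the analysis script further down uses the legacy pyecharts 0.x API (from pyecharts import Bar); in pyecharts 1.0 and later the chart classes moved to pyecharts.charts, so the import itself can serve as a version check. A minimal sketch, not part of the original scripts:

# Minimal version check (not in the original script): the code below expects
# the legacy pyecharts 0.x API, where chart classes live at the package root.
try:
    from pyecharts import Bar, Pie, WordCloud  # works on pyecharts 0.x
except ImportError:
    raise SystemExit('pyecharts >= 1.0 detected; this article targets the 0.x API.')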
3. Approach
Anyone who has crawled Zhihu before probably knows that it exposes three commonly used API endpoints:
# API endpoint for a Zhihu user's profile details
user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
# API endpoint for the users a Zhihu user follows (followees)
followees_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
# API endpoint for a Zhihu user's followers
followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
Only the third endpoint is used here, to fetch information about a given user's followers, including each follower's nickname, headline, gender, number of answers, and so on; the data is then visualized and briefly analyzed with pyecharts and a few other libraries.
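Before the full script, here is a minimal sketch of what a single request to the followers endpoint looks like. It reuses the include string and the public oauth token from the source code below; 'example-user' is a placeholder url_token, not a real account:

# Minimal sketch: fetch one page (20 users) of a user's followers.
# 'example-user' is a placeholder url_token; headers mirror the full script below.
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
include = ('include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count'
           '%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics')
url = 'https://www.zhihu.com/api/v4/members/{user}/followers?{include}&offset={offset}&limit={limit}'

resp = requests.get(url.format(user='example-user', include=include, offset=0, limit=20),
                    headers=headers, timeout=30)
page = resp.json()
for follower in page['data']:
    print(follower.get('name'), follower.get('gender'), follower.get('follower_count'))
print('last page reached:', page['paging']['is_end'])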
4. Source Code
# Zhihu follower-information crawler
import os
import json
import time
import pickle
import requests
from fake_useragent import UserAgent

ua = UserAgent()


# Zhihu follower-information crawler
class zhihu():
    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'
        }
        self.include = 'include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20'
        self.followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?{include}'

    # External entry point
    def get(self, user, savepath='./results', savename='data.pkl'):
        data = {}
        i = -1
        flag = True
        while flag:
            i += 1
            print('[INFO]: Start to get data in page %d...' % (i+1))
            include = self.include.format(i*20)
            res = requests.get(self.followers_url.format(user=user, include=include), headers=self.headers, timeout=30)
            self.headers['user-agent'] = ua.random
            res_json = json.loads(res.text)
            followers_data = res_json['data']
            for follower_data in followers_data:
                # Nickname
                name = self.__read_data_from_dict(follower_data, 'name')
                # Gender (1 = male, 0 = female, -1 = unknown)
                gender = self.__read_data_from_dict(follower_data, 'gender')
                # Headline
                headline = self.__read_data_from_dict(follower_data, 'headline')
                # Number of answers
                answer_count = self.__read_data_from_dict(follower_data, 'answer_count')
                # url_token
                url_token = self.__read_data_from_dict(follower_data, 'url_token')
                # Number of followers this follower has
                follower_count = self.__read_data_from_dict(follower_data, 'follower_count')
                # Collect the data, appending '0' to duplicate nicknames so keys stay unique
                while name in data:
                    name += '0'
                data[name] = [gender, headline, answer_count, follower_count, url_token]
            if res_json['paging']['is_end']:
                flag = False
        print('[INFO]: Start to save data...')
        self.__save_data(data, savepath, savename)
        print('[INFO]: All done...')
    # Create a requests session (defined but not used by get())
    def __create_session(self):
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=3)
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        return session

    # Read a value from a dict by key, falling back to 'unknow'
    def __read_data_from_dict(self, dictionary, key):
        value = dictionary.get(key)
        return value if value is not None else 'unknow'

    # Save the crawled data
    def __save_data(self, data, savepath, savename):
        if not os.path.exists(savepath):
            os.mkdir(savepath)
        with open(os.path.join(savepath, savename), 'wb') as f:
            pickle.dump(data, f)


if __name__ == '__main__':
    zhihu().get(user='li-li-78-92-18', savename='charles_pikachu.pkl')


###############################################################################
# Simple analysis
import os
import jieba
import pickle
from pyecharts import Bar
from pyecharts import Pie
from pyecharts import Scatter
from pyecharts import HeatMap
from pyecharts import WordCloud


# Word cloud
def DrawWordCloud(title, data, savepath='./results', width=2500, height=1300, word_size_range=[10, 15]):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    wc = WordCloud(width=width, height=height)
    attrs = [data[i][0] for i in range(len(data))]
    values = [data[i][1] for i in range(len(data))]
    wc.add('', attrs, values, word_size_range=word_size_range)
    wc.render(os.path.join(savepath, '%s.html' % title))


# Pie chart
def DrawPie(title, data, savepath='./results'):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    pie = Pie(title)
    attrs = [data[i][0] for i in range(len(data))]
    values = [data[i][1] for i in range(len(data))]
    pie.add('', attrs, values, is_label_show=True)
    pie.render(os.path.join(savepath, '%s.html' % title))


# Bar chart (2D)
def DrawBar(title, data, savepath='./results'):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    bar = Bar(title)
    attrs = [data[i][0] for i in range(len(data))]
    values = [data[i][1] for i in range(len(data))]
    bar.add('', attrs, values, mark_point=['min', 'max'])
    bar.render(os.path.join(savepath, '%s.html' % title))


# Scatter plot
def DrawScatter(title, data, savepath='./results'):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    scatter = Scatter(title)
    attrs = [data[i][0] for i in range(len(data))]
    values = [data[i][1] for i in range(len(data))]
    scatter.add('', attrs, values, is_visualmap=True)
    scatter.render(os.path.join(savepath, '%s.html' % title))


# Heat map
def DrawHeatMap(title, data, savepath='./results'):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    heatmap = HeatMap(title)
    x_axis = [data[i][0] for i in range(len(data))]
    y_axis = [data[i][1] for i in range(len(data))]
    values = [[i, i, data[i][2]] for i in range(len(data))]
    heatmap.add('', x_axis, y_axis, values, is_visualmap=True, visual_text_color='#000', visual_orient='horizontal')
    heatmap.render(os.path.join(savepath, '%s.html' % title))


# Count word frequencies
def statistics(texts, stopwords):
    words_dict = {}
    for text in texts:
        temp = jieba.cut(text)
        for t in temp:
            if t in stopwords or t == 'unknow':
                continue
            if t in words_dict.keys():
                words_dict[t] += 1
            else:
                words_dict[t] = 1
    return words_dict


if __name__ == '__main__':
    with open('./results/charles_pikachu.pkl', 'rb') as f:
        all_data = pickle.load(f)
    # Male/female ratio
    gender_data = [ad[1][0] for ad in all_data.items()]
    male = gender_data.count(1)
    female = gender_data.count(0)
    unknow = gender_data.count(-1)
    gender_data = [['男性', male], ['女性', female], ['性別未知', unknow]]
    DrawPie(title='粉絲男女比例餅圖', data=gender_data, savepath='./results')
    DrawBar(title='粉絲性別構成柱狀圖', data=gender_data, savepath='./results')
    # Nickname word cloud
    nickname_data = []
    temp = []
    for ad in all_data.items():
        ad = ad[0].strip()
        if ad and ad != 'unknow' and ad not in temp:
            nickname_data.append([ad, 1])
            temp.append(ad)
    DrawWordCloud('粉絲昵稱詞雲', nickname_data, savepath='./results')
    # Headline word cloud
    headlines = []
    for ad in all_data.items():
        ad = ad[1][1].strip()
        if ad and ad != 'unknow':
            headlines.append(ad)
    stopwords = open('./stopwords.txt', 'r', encoding='utf-8').read().split('\n')[:-1]
    headline_data = list(statistics(headlines, stopwords).items())
    DrawWordCloud('粉絲標題詞雲', headline_data, savepath='./results', width=2000, height=1200, word_size_range=[15, 20])
    # Statistics on the followers' own follower counts
    follower_count_dict = dict()
    for ad in all_data.items():
        ad = ad[1][3]
        if ad == 'unknow' or not ad:
            if 0 not in follower_count_dict:
                follower_count_dict.setdefault(0, 0)
            follower_count_dict[0] += 1
        else:
            if ad not in follower_count_dict:
                follower_count_dict.setdefault(ad, 0)
            follower_count_dict[ad] += 1
    follower_count_data = list(follower_count_dict.items())
    DrawScatter('粉絲的粉絲數量散點圖', follower_count_data, savepath='./results')
    follower_count_100 = 0
    follower_count_1000 = 0
    follower_count_other = 0
    for fcd in follower_count_data:
        if fcd[0] < 100:
            follower_count_100 += fcd[1]
        if fcd[0] < 1000 and fcd[0] >= 100:
            follower_count_1000 += fcd[1]
        if fcd[0] >= 1000:
            follower_count_other += fcd[1]
    follower_count_data1 = [['<100', '粉絲數量小於100', follower_count_100],
                            ['100-1000', '粉絲數量100-1000', follower_count_1000],
                            ['>=1000', '粉絲數量大於1000', follower_count_other]]
    DrawHeatMap('粉絲的粉絲數量熱力圖', follower_count_data1, savepath='./results')
    # Statistics on the followers' answer counts
    answer_count_dict = {0: 0, '1-20': 0, '21-40': 0, '41-60': 0, '61-80': 0, '81-100': 0, '>100': 0}
    for ad in all_data.items():
        ad = ad[1][2]
        if ad == 'unknow' or not ad:
            answer_count_dict[0] += 1
        else:
            if ad <= 20 and ad > 0:
                answer_count_dict['1-20'] += 1
            if ad <= 40 and ad > 20:
                answer_count_dict['21-40'] += 1
            if ad <= 60 and ad > 40:
                answer_count_dict['41-60'] += 1
            if ad <= 80 and ad > 60:
                answer_count_dict['61-80'] += 1
            if ad <= 100 and ad > 80:
                answer_count_dict['81-100'] += 1
            if ad > 100:
                answer_count_dict['>100'] += 1
    answer_count_data = list(answer_count_dict.items())
    DrawBar(title='粉絲的回答數量柱狀圖', data=answer_count_data, savepath='./results')
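For reference, each record the crawler pickles maps a (de-duplicated) nickname to the five-element list [gender, headline, answer_count, follower_count, url_token], which is why the analysis above indexes ad[1][0] for gender, ad[1][1] for headline, and so on. A quick way to inspect the saved file, assuming the crawler has already produced ./results/charles_pikachu.pkl:

# Inspect the pickled follower data produced by the crawler above.
import pickle

with open('./results/charles_pikachu.pkl', 'rb') as f:
    all_data = pickle.load(f)

print('followers crawled:', len(all_data))
# Print the first few records: [gender, headline, answer_count, follower_count, url_token]
for name, record in list(all_data.items())[:5]:
    gender, headline, answer_count, follower_count, url_token = record
    print(name, gender, headline, answer_count, follower_count, url_token)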
Note that the user argument passed to get() is the target user's url_token, i.e. the identifier that appears in the profile URL (https://www.zhihu.com/people/<url_token>).
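Since the url_token is simply the last path segment of a profile URL, a tiny illustrative helper (not part of the original code) makes this explicit:

# Illustrative helper: extract the url_token from a Zhihu profile URL,
# e.g. 'https://www.zhihu.com/people/li-li-78-92-18' -> 'li-li-78-92-18'.
def url_token_from_profile_url(profile_url):
    return profile_url.rstrip('/').split('/')[-1]

print(url_token_from_profile_url('https://www.zhihu.com/people/li-li-78-92-18'))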