Python爬蟲入門—分析Ajax爬取今日頭條美圖

02-02

聲明：

代碼是反覆看視頻、最後理解後自己敲下來的，不是原創。來源：Python3爬蟲三大案例實戰分享 - 網易雲課堂。感謝崔大神@靜覓。

目的：

學習交流。我也是剛開始學習python(2017.09.08開始)，想通過寫文章促進學習~

覺得不錯的點個贊，有問題歡迎留言交流~

學習資料：

MongoDB 極簡實踐入門、
Windows 平台安裝 MongoDB、AJAX 教程 | 菜鳥教程
Python 小結（一）（2017.9.8-2017.10.3）
Python小結（二）(2017.10.4-2017.10.15)

新知識：

md5的使用方法；
多進程pool.map（）；
pymongo、MongoDB和urlencode的使用

結果圖：

資料庫

爬取流程：

1.環境配置

Anaconda+Mongodb(MongoDB for GIANT Ideas)、Robo 3T(可視化Robo 3T）+

基本庫:

requests、BeautifulSoup-Python網路爬蟲與信息提取_中國大學MOOC(慕課)、
json 從零開始的 JSON 庫教程、
re 正則表達式 - 教程 | 菜鳥教程、正則表達式30分鐘入門教程 - deerchao - 博客園
pymongo pymongo官方文檔

2.了解Ajax

AJAX 是一種在無需重新載入整個網頁的情況下，能夠更新部分網頁的技術。

AJAX 教程 | 菜鳥教程

3.爬取思路

a.抓取和分析索引頁

抓取索引頁

索引頁

def get_page_first(offest, keyword):
    """Fetch one page of the search index and return its raw response text.

    Args:
        offest: Value sent as the ``offset`` query parameter (varies per page).
        keyword: Search keyword; can be any string.

    Returns:
        The response body text when the server answers with HTTP 200,
        otherwise ``None`` (also ``None`` when the request raises).
    """
    # How to find these parameters in the browser:
    # F12 -> Network -> F5 -> XHR -> Headers -> Query String Parameters.
    data = {
        'offset': offest,      # changes from page to page
        'format': 'json',
        'keyword': keyword,    # freely customisable
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    # urlencode turns the dict into URL query parameters
    # (from urllib.parse import urlencode).
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url)  # request the index URL
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求異常')
        return None

分析索引頁

def parse_page_first(html):
    """Yield every ``article_url`` found in the index page JSON.

    Args:
        html: JSON text of the index response, or ``None``/empty when the
            download failed.

    Yields:
        The ``article_url`` value of each entry under the top-level ``data``
        key (``None`` for entries that have no such field).
    """
    # The fetch step returns None on failure; json.loads(None) would raise.
    if not html:
        return
    data = json.loads(html)  # convert the text into a JSON object
    # Only proceed when the payload is an object that carries a "data" list.
    if isinstance(data, dict):
        for item in data.get('data') or []:
            # Generator: hand out the article URLs one by one.
            yield item.get('article_url')

b.抓取詳情頁的內容

詳情頁

抓取詳情頁

def get_page_detai(url):
    """Download a detail page and return its HTML text.

    Args:
        url: URL of the detail page.

    Returns:
        The response body text on HTTP 200, otherwise ``None`` (also ``None``
        when the request raises a ``RequestException``).
    """
    try:
        response = requests.get(url)  # request the detail URL
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求異常')
        return None

分析詳情頁

# Step 1: get the overall record of each picture group (title, images, ...).
# Step 2: get the link of every picture in the group (only the links are needed).
def parse_page_chirld(htmlchirld, url):
    """Extract title and image links from a detail page and download the images.

    Args:
        htmlchirld: HTML text of the detail page.
        url: URL of the detail page (stored in the returned record).

    Returns:
        A dict with ``title``, ``url`` and ``images_url`` when the page
        contains a gallery with ``sub_images``; otherwise ``None``.
    """
    soup = BeautifulSoup(htmlchirld, 'lxml')
    title = soup.select('title')[0].get_text()
    print(title)
    # The gallery JSON sits on a line of the form "gallery: {...},".
    images_pattern = re.compile(r'gallery: (.*?),\n', re.S)
    result = re.search(images_pattern, htmlchirld)
    if result:  # the pattern matched
        # Parse the captured string into a JSON object.
        data = json.loads(result.group(1))
        # Check that it holds the data we are after.
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images_url = [item.get('url') for item in sub_images]
            for image in images_url:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images_url': images_url,
            }


# Download a picture
def download_image(url):
    """Download one image and hand its bytes to ``save_image``.

    Always returns ``None``; request failures are reported with ``print``.
    """
    print('正在下載', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # .content is the binary body (images are stored as binary data).
            save_image(response.content)
        return None
    except RequestException:
        print('請求圖片出錯', url)
        return None


# Save a picture
def save_image(content):
    """Write image bytes to disk, named by the MD5 of the content.

    Using the digest as the file name means identical images map to the
    same path, so an existing file is not written again.
    """
    # NOTE(review): the directory is assumed to exist already - confirm.
    file_path = '{0}/{1}.{2}'.format(r'D:\jiepai', md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)

c.保存到資料庫

新建config.py

# `from config import *` brings every variable defined in config.py
# into the importing module.
MONGO_URL = 'localhost'    # database address
MONGO_DB = 'toutiao'       # database name
MONGO_TABLE = 'toutiao'    # table (collection) name

GROUP_START = 1
GROUP_END = 5              # last loop index (the main loop runs GROUP_START..GROUP_END)
KEYWORD = '街拍'           # search keyword; change it freely

完整代碼

import requests
from urllib.parse import urlencode
import json
from hashlib import md5
from bs4 import BeautifulSoup
import re
import os
from requests.exceptions import RequestException
import pymongo
from config import *
#from multiprocessing import Pool

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_page_first(offest, keyword):
    """Fetch one page of the search index; return its text or ``None``."""
    data = {
        'offset': offest,      # changes from page to page
        'format': 'json',
        'keyword': keyword,    # freely customisable
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url)  # request the index URL
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求異常')
        return None


def parse_page_first(html):
    """Yield the ``article_url`` of every entry in the index JSON."""
    # get_page_first returns None on failure; json.loads(None) would raise.
    if not html:
        return
    data = json.loads(html)  # convert the text into a JSON object
    # Only proceed when the payload is an object that carries a "data" list.
    if isinstance(data, dict):
        for item in data.get('data') or []:
            yield item.get('article_url')


def get_page_detai(url):
    """Download a detail page; return its HTML text or ``None``."""
    try:
        response = requests.get(url)  # request the detail URL
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求異常')
        return None


def parse_page_chirld(htmlchirld, url):
    """Extract title and image links from a detail page, downloading each image.

    Returns a dict with ``title``, ``url`` and ``images_url`` when the page
    contains a gallery with ``sub_images``; otherwise ``None``.
    """
    soup = BeautifulSoup(htmlchirld, 'lxml')
    title = soup.select('title')[0].get_text()
    print(title)
    # The gallery JSON sits on a line of the form "gallery: {...},".
    images_pattern = re.compile(r'gallery: (.*?),\n', re.S)
    result = re.search(images_pattern, htmlchirld)
    if result:  # the pattern matched
        # Parse the captured string into a JSON object.
        data = json.loads(result.group(1))
        # Check that it holds the data we are after.
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images_url = [item.get('url') for item in sub_images]
            for image in images_url:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images_url': images_url,
            }


def download_image(url):
    """Download one image and pass its bytes to ``save_image``."""
    print('正在下載', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)  # .content is the binary body
        return None
    except RequestException:
        print('請求圖片出錯', url)
        return None


def save_to_mongo(result):
    """Insert one record into MongoDB; return ``True`` on success."""
    # insert_one replaces Collection.insert, which was deprecated in
    # PyMongo 3 and removed in PyMongo 4.
    if db[MONGO_TABLE].insert_one(result):
        print('儲存mongodb成功', result)
        return True
    return False


def save_image(content):
    """Write image bytes to disk, named by the MD5 digest of the content."""
    # NOTE(review): the directory is assumed to exist already - confirm.
    file_path = '{0}/{1}.{2}'.format(r'D:\jiepai', md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one index page: fetch it, visit each article, store the results."""
    html = get_page_first(offset, KEYWORD)
    print(html)
    for url in parse_page_first(html):
        htmlchirld = get_page_detai(url)
        if htmlchirld:
            result = parse_page_chirld(htmlchirld, url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    # The offset passed to main is the loop index times 20.
    for x in range(GROUP_START, GROUP_END + 1):
        main(x * 20)