[爬蟲,pymongo,json]分析ajax請求並抓取今日頭條組圖

02-03

今日頭條搜索「街拍」，爬取搜索結果組圖頁面內所有文章圖片。

安裝mongoDB

homebrew安裝
使用homebrew的方法開啟

brew update 更新brew
brew install mongodb 安裝mongodb資料庫

brew services start mongodb 啟動mongodb資料庫
brew services stop mongodb 停止mongodb資料庫

分析ajax請求並抓取今日頭條組圖保存至mongoDB

網站分析

搜索後出現索引頁，多篇文章聚集的頁面
json格式，使用json.loads解析
格式層次 data → 0 → article_url: XXX，嵌套使用 get 獲取 article_url 對應的值
索引頁中獲取的article_url對應文章（詳情頁）

regex獲得gallery對應的內容（json格式）解析json，獲得其中的url（image_url）
搜索後出現索引頁，多篇文章聚集的頁面
json格式，使用json.loads解析
格式層次 data → 0 → article_url: XXX，嵌套使用 get 獲取 article_url 對應的值
索引頁中獲取的article_url對應文章（詳情頁）
regex獲得gallery對應的內容（json格式）解析json，獲得其中的url（image_url）

抓取索引頁代碼
獲取索引頁response
解析索引頁，獲得詳情頁url
抓取詳情頁代碼
獲取詳情頁response
解析詳情頁，獲得title及images_url
BeautifulSoup環境配置+lxml安裝

#安裝lxml:
brew install libxml2
brew install libxslt
brew link libxml2 --force
brew link libxslt --force
pip install lxml
pip install scrapy

下載圖片並保存數據

mongoDB資料庫使用

mongoDB初始化

#創建mongoDB參數，新建mongoDB_config.py，輸入：
MONGO_URL = 'localhost'
MONGO_DB = 'TouTial'
MONGO_TABLE = 'JiePai_img'
#回到main.py初始化mongoDB對象
client = pymongo.MongoClient(MONGO_URL)
DB = client[MONGO_DB]

循環及多進程（multiprocessing.Pool 使用的是進程池，而非線程）
每一步最好都添加異常處理，避免無法運行

源代碼

# ---- mongoDB_config.py ----
# Connection settings; kept in their own module so main.py can import them.
MONGO_URL = 'localhost'
MONGO_DB = 'TouTial'
MONGO_TABLE = 'JiePai_img'

# ---- main.py ----
# Crawl Toutiao search results for a keyword, download every gallery image of
# each article found, and store one record per article in MongoDB.
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo.errors import PyMongoError
from requests.exceptions import RequestException

from mongoDB_config import MONGO_DB, MONGO_TABLE, MONGO_URL

KEYWORD = '街拍'
PAGE_SIZE = 20        # number of results requested per index page
PAGE_COUNT = 20       # number of index pages crawled by the entry point
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# Captures the JSON object assigned to "gallery" inside the detail page's
# inline BASE_DATA.galleryInfo script. Compiled once instead of per page.
GALLERY_PATTERN = re.compile(
    r'BASE_DATA.galleryInfo.*?gallery.*?({.*?}),\s*?siblingList', re.S)

# Module-level MongoDB handles shared by save_to_mongoDB().
# NOTE(review): connect=False defers the connection until first use so that a
# forked worker does not inherit a live connection from the parent; confirm
# against the PyMongo multiprocessing guidance for the installed version.
client = pymongo.MongoClient(MONGO_URL, connect=False)
DB = client[MONGO_DB]


def _fetch(url, error_message):
    """Return the response for *url* when it answers 200, otherwise None.

    The request itself is inside the ``try`` so that connection errors,
    timeouts and malformed URLs are reported via *error_message* instead of
    propagating to the caller.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        print(error_message)
        return None
    if response.status_code == 200:
        return response
    return None


def get_page_index(offsetInput, keyword):
    """Fetch one search-result index page.

    Args:
        offsetInput: result offset of the page to request.
        keyword: search keyword.

    Returns:
        The response body as text, or None if the request failed or the
        status code was not 200.
    """
    query = {
        'offset': offsetInput,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': PAGE_SIZE,
        'cur_tab': 3,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(query)
    response = _fetch(url, '請求索引頁出錯')
    return response.text if response is not None else None


def parse_page_index(html):
    """Yield the article URLs listed in an index page.

    Args:
        html: JSON text returned by get_page_index(); may be None or empty.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data`` list.
        Nothing is yielded when *html* is empty or is not valid JSON.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        # Some entries carry no article link; skip them rather than yield None.
        article_url = item.get('article_url') if isinstance(item, dict) else None
        if article_url:
            yield article_url


def get_page_img(article_url):
    """Fetch an article (detail) page.

    Returns:
        The page HTML as text, or None if the request failed or the status
        code was not 200.
    """
    response = _fetch(article_url, '請求詳情頁出錯')
    return response.text if response is not None else None


def parse_page_img(html_img, article_url):
    """Extract the title and gallery image URLs from a detail page.

    Every image found is downloaded as a side effect.

    Args:
        html_img: HTML text of the detail page.
        article_url: URL the page was fetched from (stored in the result).

    Returns:
        A dict with keys ``title``, ``url`` and ``images``, or None when the
        page contains no parsable gallery.
    """
    soup = BeautifulSoup(html_img, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    match = GALLERY_PATTERN.search(html_img)
    if not match:
        return None
    try:
        gallery = json.loads(match.group(1))
    except ValueError:
        return None
    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None

    images = [item.get('url') for item in gallery.get('sub_images') or []]
    for image in images:
        if image:
            download_img(image)
    return {
        'title': title,
        'url': article_url,
        'images': images,
    }


def save_to_mongoDB(result):
    """Insert one article record into the configured collection.

    Returns:
        True if the document was inserted, False if PyMongo raised an error.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one replaces it.
        DB[MONGO_TABLE].insert_one(result)
    except PyMongoError:
        print('存儲到MongoDB出錯', result)
        return False
    print('存儲成功', result)
    return True


def download_img(url):
    """Download one image and save it to the current working directory."""
    response = _fetch(url, '下載圖片出錯')
    if response is not None:
        save_img(response.content)
        print('正在下載', url)
    return None


def save_img(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the content hash keeps identical images from being
    stored twice.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset, keyword=KEYWORD):
    """Crawl one index page: fetch, parse, download and store every article.

    Args:
        offset: result offset of the index page to crawl.
        keyword: search keyword; defaults to KEYWORD so that ``main`` can be
            used directly as a one-argument ``Pool.map`` worker.
    """
    html = get_page_index(offset, keyword)
    for article_url in parse_page_index(html):
        html_img = get_page_img(article_url)
        if not html_img:
            continue
        result = parse_page_img(html_img, article_url)
        if result:
            save_to_mongoDB(result)


if __name__ == '__main__':
    # One worker call per index page. multiprocessing.Pool uses processes,
    # not threads. For a sequential run, use instead:
    #     for offset in offsets: main(offset)
    # Running both would crawl every page twice.
    offsets = [page * PAGE_SIZE for page in range(PAGE_COUNT)]
    with Pool() as pool:
        pool.map(main, offsets)