Crawling all posts from the first 100 pages of Hupu's BXJ (步行街) board with coroutines and aiohttp
The full script:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import asyncio
import aiohttp
import time
from pyquery import PyQuery as pq
import codecs
import json


async def get_post_url(url):
    """Fetch one listing page and collect the links of every post on it."""
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as response:
            body = await response.text(encoding="utf-8")
            post_list = parser(body)
            for post_url in post_list:
                post_urls.append(post_url)


async def get_post_info(url):
    """Fetch one post page and extract its title, author and post time."""
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as response:
            body = await response.text(encoding="utf-8")
            post_info = parse_post(body)
            if post_info is not None:
                post_info["url"] = url
                post_lists.append(post_info)


def parse_post(html):
    """Parse a post page."""
    post_info = {}
    doc = pq(html)
    main_post = doc("div#tpc")
    post_author = main_post.find("div.author a.u").text()
    post_time = main_post.find("div.author span.stime").text()
    post_title = doc("h1#j_data").text()
    post_info["title"] = post_title
    post_info["time"] = post_time
    post_info["author"] = post_author
    if not post_title and not post_time and not post_author:
        return None
    return post_info


def parser(html):
    """Parse a listing page."""
    post_list = []
    doc = pq(html)
    links_item = doc('table[id="pl"]').find("tbody").find("tr[mid]")
    for link_item in links_item.items():
        post_link = link_item.find("td.p_title").find("a").attr("href")
        post_link = "https://bbs.hupu.com" + post_link
        post_list.append(post_link)
    return post_list


# Record the start time
start_time = time.time()
# List that stores the scraped data
post_lists = []
# List of every post link
post_urls = []
# Create the event loop
loop = asyncio.get_event_loop()
# Add the first 100 listing pages of the BXJ board to the event loop,
# fetch them concurrently, and collect every post link
urls = ["https://bbs.hupu.com/bxj-postdate-{}".format(i) for i in range(1, 101)]
tasks = [get_post_url(url) for url in urls]
loop.run_until_complete(asyncio.wait(tasks))
# Print the total number of posts
print(len(post_urls))
# Add the post links to the event loop in batches of 1000 and fetch their content
for i in range(0, len(post_urls), 1000):
    length = len(post_urls) - i
    if length >= 1000:
        length = 1000
    print(i)
    end_time = time.time()
    print("cost time", end_time - start_time)
    tasks = [get_post_info(post_urls[num + i]) for num in range(length)]
    loop.run_until_complete(asyncio.wait(tasks))
loop.close()
# Save all the data
post_dicts = {"posts": post_lists, "length": len(post_lists)}
with codecs.open("post.json", "w", "utf-8") as f:
    f.write(json.dumps(post_dicts, indent=True))
end_time = time.time()
print("cost time", end_time - start_time)
```
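The script drives everything through `loop.run_until_complete`, which works but is the older asyncio style. On Python 3.7+, the same fan-out over the 100 listing pages can be written with `asyncio.run` and `asyncio.gather`, sharing a single `ClientSession` across all requests instead of opening a new session per URL. A minimal sketch under those assumptions; `fetch` and `main` are illustrative names, not part of the original script:

```python
import asyncio
import aiohttp


async def fetch(client, url):
    # Reuse one ClientSession so all requests share a connection pool.
    async with client.get(url) as response:
        return await response.text(encoding="utf-8")


async def main():
    urls = ["https://bbs.hupu.com/bxj-postdate-{}".format(i) for i in range(1, 101)]
    async with aiohttp.ClientSession() as client:
        # gather schedules every fetch concurrently and preserves input order.
        pages = await asyncio.gather(*(fetch(client, url) for url in urls))
    print(len(pages))


if __name__ == "__main__":
    # asyncio.run creates and closes the event loop for us.
    asyncio.run(main())
```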
At most 1000 coroutines run at a time. With a single IP, the full crawl took 358 seconds, just under 6 minutes, which works out to about 32 posts per second. That is probably close to the limit for a single IP.
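Rather than slicing the URL list into fixed chunks of 1000 and waiting for the slowest request in each chunk, the same cap on concurrency can be enforced with a semaphore, which keeps that many requests in flight at all times. A sketch of the idea, assuming aiohttp; `fetch_limited` and `crawl_all` are hypothetical helpers, not part of the original script:

```python
import asyncio
import aiohttp


async def fetch_limited(client, sem, url):
    # The semaphore admits at most `limit` coroutines into this block at once,
    # so a new request starts as soon as any earlier one finishes.
    async with sem:
        async with client.get(url) as response:
            return await response.text(encoding="utf-8")


async def crawl_all(urls, limit=1000):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as client:
        return await asyncio.gather(*(fetch_limited(client, sem, u) for u in urls))
```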