A General Crawler Approach (Python 3)


A crawler really comes down to just three steps: download the data, parse the data, and save the data.

This article walks through a code example showing each of these three steps.

1. Download the data

import urllib.request
import urllib.error
import urllib.parse


def dowmlpad(url, user_agent='wswp', proxy=None, num_retries=2, timeout=5):
    """
    Download a page.
    - retries on 5xx errors
    - sets the user agent (user_agent)
    - supports an IP proxy
    """
    print('Download URL:', url)

    # Configure the user agent
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Build the opener
    opener = urllib.request.build_opener()

    # Route the request through a proxy if one was given
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request, timeout=timeout).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        # Retry on 5xx server errors
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = dowmlpad(url, user_agent=user_agent, proxy=proxy,
                                num_retries=num_retries - 1, timeout=timeout)
    except Exception as e:
        print('error:', e)
        html = None

    return html
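
For a quick sanity check the function can also be called on its own. The user agent and proxy values below are placeholders chosen for illustration, not anything used elsewhere in this post:

# Plain download with the defaults ('wswp' user agent, two retries on 5xx errors)
html = dowmlpad('https://movie.douban.com/')

# The same request with a custom user agent, routed through a hypothetical local proxy
html = dowmlpad('https://movie.douban.com/',
                user_agent='my-crawler/0.1',     # placeholder user agent
                proxy='http://127.0.0.1:8080',   # placeholder proxy address
                timeout=10)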

2. Parse the data

import re
import lxml.html


# Scraping rule: pull the fields we want out of a page
def scrape_callback(url, html):
    csslist = ['span[property="v:itemreviewed"]', 'span.year', 'strong[property="v:average"]']
    try:
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('{0}'.format(field))[0].text for field in csslist]

        print(url, row)
    except Exception as e:
        print("ScrapeCallback error:", e)


"""
seed_url: the seed URL
link_regex: regular expression used to pick out links
max_depath: how deep to follow links; with the default of 2 the crawler stops extracting
            links once it reaches the second level, i.e. the pages linked from the seed page
scrape_callback: callback function
"""
def link_crawler(seed_url, link_regex, max_depath=2, scrape_callback=None):
    crawl_queue = [seed_url]  # the crawl queue, really just a list of URLs
    # seen = set(crawl_queue)
    seens = {seed_url: 1}

    # Loop until the queue is empty
    while crawl_queue:
        url = crawl_queue.pop()  # remove and return the last element of the queue
        html = dowmlpad(url)     # download the page at this URL
        depth = seens[url]       # depth of this URL
        print(depth)

        # Extract the links found on the page
        for link in get_links(html):
            if depth != max_depath and re.search(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)  # build an absolute link

                # Add the link to the crawl queue
                if link not in seens:
                    seens[link] = depth + 1
                    crawl_queue.append(link)

        # If a callback was supplied, invoke it on the page
        if scrape_callback:
            scrape_callback(url, html)
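
One thing to note: link_crawler relies on a get_links helper that is not shown in this post (it lives in the full repository linked at the end). As a rough idea of what it has to do, here is a minimal regex-based sketch; the name matches the call above, but the body is an assumption rather than the repository's actual implementation:

def get_links(html):
    """Return every href value found in the page (minimal, assumed implementation)."""
    if html is None:
        return []
    # dowmlpad returns bytes, so decode before applying the regular expression
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='ignore')
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)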

3. Save the data

import csv


class ScrapeCallback:
    def __init__(self):
        # newline='' stops the csv module from inserting blank rows on Windows under Python 3
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = ('name', 'year', 'score')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        csslist = ['span[property="v:itemreviewed"]', 'span.year', 'strong[property="v:average"]']
        try:
            tree = lxml.html.fromstring(html)
            row = [tree.cssselect('{0}'.format(field))[0].text for field in csslist]
            self.writer.writerow(row)
            print(url, row)
        except Exception as e:
            print("ScrapeCallback error:", e)
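
Because ScrapeCallback defines __call__, an instance behaves like a plain function, which is why link_crawler can accept it in place of the scrape_callback function from step 2. A small standalone illustration (the subject URL is just an example movie page):

saver = ScrapeCallback()
url = 'https://movie.douban.com/subject/1292052/'  # example Douban movie page
html = dowmlpad(url)
if html:
    saver(url, html)  # invokes __call__ and appends one row to countries.csv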

Finally, the main function

if __name__ == '__main__':
    # Quick test
    send_url = "https://movie.douban.com/"

    link_regex = r'(/subject/[\d]+/)'  # rule for extracting links

    # The two calls below give the same result; this one uses the class-based callback
    link_crawler(send_url, link_regex, max_depath=2, scrape_callback=ScrapeCallback())
    # link_crawler(send_url, link_regex, max_depath=2, scrape_callback=scrape_callback)

This only sketches the overall framework; for more detailed comments, see zhangslob/Python-General-Spider.

Crawling is really quite simple; don't get too hung up on particular techniques. Just remember the three steps: download the data, parse the data, save the data.



