A General Crawler Approach (Python 3)
A crawler really boils down to three steps: download the data, parse the data, and save the data.
This post walks through a code example that demonstrates each of the three steps.
1. Download the data
import urllib.error
import urllib.parse
import urllib.request


def dowmlpad(url, user_agent='wswp', proxy=None, num_retries=2, timeout=5):
    """
    Download a page.
    - retries on 5xx server errors
    - sets the User-Agent header (user_agent)
    - supports an optional proxy
    """
    print('Download URL:', url)

    # Set the user agent
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)

    # Build an opener
    opener = urllib.request.build_opener()

    # Attach a proxy handler if a proxy was supplied
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request, timeout=timeout).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = dowmlpad(url, user_agent, proxy, num_retries - 1)
    except Exception as e:
        print('error:', e)
        html = None

    return html
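As a quick sanity check, the downloader can be exercised on its own. The snippet below is only an illustration (it reuses the Douban URL from the main function at the end of the post and the default 'wswp' user agent), not part of the original code:

# Smoke test for the downloader (illustrative only)
page = dowmlpad('https://movie.douban.com/', user_agent='wswp')
if page is not None:
    print('Fetched', len(page), 'bytes')
else:
    print('Download failed')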
2. Parse the data
import re
import urllib.parse

import lxml.html


# Scraping rule: pull the fields we care about out of the page
def scrape_callback(url, html):
    csslist = ['span[property="v:itemreviewed"]', 'span.year',
               'strong[property="v:average"]']
    try:
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('{0}'.format(field))[0].text for field in csslist]
        print(url, row)
    except Exception as e:
        print("ScrapeCallback error:", e)


def link_crawler(seed_url, link_regex, max_depath=2, scrape_callback=None):
    """
    seed_url:        the seed URL
    link_regex:      regular expression that links must match to be followed
    max_depath:      link-extraction depth; with the default of 2 the crawler
                     stops extracting links once it reaches the second level,
                     i.e. the pages linked from the seed page
    scrape_callback: callback invoked on every downloaded page
    """
    crawl_queue = [seed_url]     # crawl queue: just a list of URLs to visit
    # seen = set(crawl_queue)
    seens = {seed_url: 1}        # map each URL to its crawl depth

    # Loop until the queue is empty
    while crawl_queue:
        url = crawl_queue.pop()  # pop the last URL from the queue
        html = dowmlpad(url)     # download the page for this URL
        depth = seens[url]       # depth of this URL
        print(depth)

        # Extract the links found on this page
        # (get_links is not shown in the post; see the sketch below)
        for link in get_links(html):
            if depth != max_depath and re.search(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)  # build an absolute URL

                # Add the link to the crawl queue if it has not been seen yet
                if link not in seens:
                    seens[link] = depth + 1
                    crawl_queue.append(link)

        # If a scrape callback was supplied, run it on this page
        if scrape_callback:
            scrape_callback(url, html)
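Note that link_crawler relies on a get_links helper that the post does not show. A minimal sketch, assuming it simply pulls every href value out of the raw HTML (the real project may implement it differently), could look like this:

import re


def get_links(html):
    """Hypothetical helper assumed by link_crawler: return every href found in the HTML."""
    if html is None:
        return []
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='ignore')
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)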
3. Save the data
import csv

import lxml.html


class ScrapeCallback:
    """Callback that writes the scraped fields to countries.csv."""

    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('name', 'year', 'score')
        self.writer.writerow(self.fields)  # header row

    def __call__(self, url, html):
        csslist = ['span[property="v:itemreviewed"]', 'span.year',
                   'strong[property="v:average"]']
        try:
            tree = lxml.html.fromstring(html)
            row = [tree.cssselect('{0}'.format(field))[0].text for field in csslist]
            self.writer.writerow(row)
            print(url, row)
        except Exception as e:
            print("ScrapeCallback error:", e)
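One caveat on the class above: the CSV file opened in __init__ is never explicitly closed. The subclass below is my own sketch (the class name, the path parameter, and the newline/encoding arguments are not from the original post); it keeps the file handle so it can be closed, while reusing the __call__ method unchanged:

import csv


# Sketch of a variant that owns its file handle; opens with newline='' as the csv docs advise
class ClosableScrapeCallback(ScrapeCallback):
    def __init__(self, path='countries.csv'):
        self.file = open(path, 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.fields = ('name', 'year', 'score')
        self.writer.writerow(self.fields)  # header row

    def close(self):
        self.file.close()

It can be passed to link_crawler exactly like ScrapeCallback(); just call close() once the crawl finishes.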
Finally, the main function
if __name__ == '__main__':
    # Test run
    send_url = "https://movie.douban.com/"

    link_regex = r'(/subject/[\d]+/)'  # rule for picking out the links to follow

    # The class-based callback and the plain function give the same result
    link_crawler(send_url, link_regex, max_depath=2, scrape_callback=ScrapeCallback())
    # link_crawler(send_url, link_regex, max_depath=2, scrape_callback=scrape_callback)
This only sketches the overall framework; for more detailed comments, see zhangslob/Python-General-Spider.
Crawling is really quite simple. Don't agonize over the method; just remember the three steps: download the data, parse the data, save the data.
Feel free to add me on WeChat to discuss.