Scraping Tens of Thousands of QQ Avatar Images with Python



Recently a friend's company needed tens of thousands of avatar images. Downloading them by hand from the website would have been a tedious and enormous job, so I wrote this code. It follows a standard framework for crawling static web pages and can be adapted to many other static-page scraping tasks.

Runtime environment:

Kali Linux

Python 3.5

Required modules:

urllib.request

urllib.parse

re,time,sys

BeautifulSoup

Note: except for BeautifulSoup, which needs to be installed with pip3 install beautifulsoup4, all of these modules ship with Python's standard library. A quick import check is sketched below.
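
If you want to double-check the environment before running anything, a minimal sketch like the following (my own addition, not one of the project files) simply verifies that every module imports under Python 3:

# check_env.py - optional sanity check, not part of the five project files
import re
import sys
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # installed with: pip3 install beautifulsoup4

print('Python', sys.version.split()[0], '- all required modules imported OK')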

Python files:

spider_main.py

url_manager.py

html_downloader.py

html_parser.py

html_outputer.py

Target website:

Home page: http://woyaogexing.com/

Code walkthrough:

spider_main.py (the scheduler)

# spider_main.py - the scheduler: wires the URL manager, downloader, parser and outputer together
import time  # only needed if the sleep in the except branch is re-enabled

import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain():
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            html_cont = None
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                if count == 1500:
                    break
                count = count + 1
            except Exception as e:
                # time.sleep(1)
                print('craw failed ' + str(e))
            print('Downloading images...')
            self.outputer.get_img(html_cont)


if __name__ == '__main__':
    root_url = 'http://www.woyaogexing.com/'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

url_manager.py

# url_manager.py - keeps track of URLs waiting to be crawled and URLs already crawled
class UrlManager():
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
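
To make the bookkeeping concrete, here is a short usage sketch of UrlManager (the URL is just an illustrative value):

manager = UrlManager()
manager.add_new_url('http://www.woyaogexing.com/')
manager.add_new_url('http://www.woyaogexing.com/')  # duplicate, silently ignored
print(manager.has_new_url())   # True
url = manager.get_new_url()    # moved from new_urls into old_urls
print(manager.has_new_url())   # False - the only pending URL was handed out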

html_downloader.py

# html_downloader.py - fetches a page with a browser-like User-Agent
import urllib.request


class HtmlDownloader():
    def download(self, url):
        if url is None:
            return None
        req = urllib.request.Request(url)
        req.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36')
        response = urllib.request.urlopen(req)
        if response.getcode() != 200:
            return None
        html = response.read().decode('utf-8')
        return html
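
On a slow or flaky connection urlopen can hang or raise, which aborts that crawl iteration; a hedged variant of the same downloader with a timeout and basic error handling could look like this (the 10-second timeout and the shortened User-Agent are my own choices, not from the original code):

import urllib.error
import urllib.request


class HtmlDownloaderWithTimeout():
    def download(self, url, timeout=10):
        if url is None:
            return None
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            response = urllib.request.urlopen(req, timeout=timeout)
        except urllib.error.URLError as e:
            print('download failed: %s (%s)' % (url, e))
            return None
        if response.getcode() != 200:
            return None
        return response.read().decode('utf-8')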

html_parser.py

# html_parser.py - extracts links to avatar detail pages from a downloaded page
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser():
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # detail pages look like /touxiang/<category>/2018/<id>.html
        links = soup.find_all('a', href=re.compile(r'/touxiang/\w+/2018/\d+\.html'))
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return None
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        return new_urls
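
The href values on the listing pages are typically relative paths, which is why urljoin is needed; a tiny illustration (the path below is a hypothetical example in the same shape the regex matches):

from urllib.parse import urljoin

page_url = 'http://www.woyaogexing.com/touxiang/'
print(urljoin(page_url, '/touxiang/nan/2018/12345.html'))
# -> http://www.woyaogexing.com/touxiang/nan/2018/12345.html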

html_outputer.py

# html_outputer.py - finds .jpg links in a page and saves them to the current directory
import re
import urllib.request


class HtmlOutputer():
    def __init__(self):
        self.html = []

    def collect_data(self, html):
        if html is None:
            return
        self.html.append(html)

    def get_img(self, html):
        if html is None:
            return
        # match image URLs such as <a href="http://.../xxx.jpg"
        p = r'<a href="([^<]*\.jpg)"'
        for each in re.findall(p, html):
            filename = each.split('/')[-1]
            urllib.request.urlretrieve(each, filename, None)
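
Because the crawl saves thousands of files into the working directory, it can help to skip images that are already on disk and to pause briefly between downloads; a possible variant of get_img (the 0.5-second delay is my own choice, not from the original code):

import os
import re
import time
import urllib.request

def get_img_politely(html, delay=0.5):
    if html is None:
        return
    for each in re.findall(r'<a href="([^<]*\.jpg)"', html):
        filename = each.split('/')[-1]
        if os.path.exists(filename):   # already downloaded on an earlier run
            continue
        urllib.request.urlretrieve(each, filename, None)
        time.sleep(delay)              # be polite to the server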

Sample run: for each crawled page the console prints a "craw N: url" line followed by "Downloading images...", and the matched .jpg files are saved into the working directory.

