Scraping Meizitu with Scrapy

1. Write the item file, defining a field for each piece of content to scrape ---- items.py

import scrapy


class MeizituItem(scrapy.Item):
    img_name = scrapy.Field()    # name of the image set
    img_link = scrapy.Field()    # URL of the image set
    img_number = scrapy.Field()  # number of images in the set
    detail_img = scrapy.Field()  # URL of a single .jpg
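scrapy.Field() only registers a key; a populated MeizituItem then behaves like a dict, which is what the JSON pipeline in step three relies on when it calls dict(item). A quick sketch with made-up values:

from Meizitu.items import MeizituItem

item = MeizituItem()
item['img_name'] = 'example set'   # assign fields like dict keys (made-up value)
item['img_number'] = '42'
print(dict(item))                  # {'img_name': 'example set', 'img_number': '42'}
print(item['img_name'])            # 'example set'

Unlike a plain dict, assigning a key that was not declared as a Field raises a KeyError, which catches typos early.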

2. Write the spider ---- meizi.py

# -*- coding: utf-8 -*-
import scrapy
from Meizitu.items import MeizituItem


class MeiziSpider(scrapy.Spider):
    name = 'meizi'
    allowed_domains = ['mzitu.com']
    start_urls = ['http://mzitu.com/']

    def parse(self, response):
        # every image set on the list page sits in an <li> under ul#pins
        node_list = response.xpath('//ul[@id="pins"]/li')
        for node in node_list:
            item = MeizituItem()
            item['img_name'] = node.xpath('./span/a/text()').extract_first()
            item['img_link'] = node.xpath('./span/a/@href').extract_first()
            yield scrapy.Request(url=item['img_link'], callback=self.detail_page,
                                 meta={'item': item})

    def detail_page(self, response):
        item = response.meta['item']
        # number of photos in the set, read from the pagination bar
        item['img_number'] = response.xpath(
            '//div[@class="pagenavi"]/a[5]/span/text()').extract_first()
        # the pages of a set live at <set URL>/1, <set URL>/2, ...
        for i in range(1, int(item['img_number']) + 1):
            yield scrapy.Request(url=item['img_link'] + '/' + str(i),
                                 callback=self.page_img, meta={'item': item})

    def page_img(self, response):
        # copy the item: all pages of one set share it via meta, and a shared
        # reference would let concurrent responses overwrite detail_img
        item = response.meta['item'].copy()
        item['detail_img'] = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract_first()
        yield item
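The two XPath expressions in parse() can be sanity-checked outside a crawl with scrapy.Selector. The HTML fragment below is an invented stand-in for one entry of the list page, shaped the way the spider expects:

from scrapy import Selector

# hypothetical fragment mimicking one <li> under ul#pins
html = '''
<ul id="pins">
  <li><span><a href="http://mzitu.com/12345">Some image set</a></span></li>
</ul>
'''

sel = Selector(text=html)
for node in sel.xpath('//ul[@id="pins"]/li'):
    print(node.xpath('./span/a/text()').extract_first())  # Some image set
    print(node.xpath('./span/a/@href').extract_first())   # http://mzitu.com/12345

If the site changes its markup, these selectors are the first thing to re-check.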

3. Write the pipelines file ---- pipelines.py

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from Meizitu.settings import IMAGES_STORE


class ImgPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # take the image URL from the item and request it;
        # the response is saved under the path configured by IMAGES_STORE
        image_link = item['detail_img']
        yield scrapy.Request(url=image_link)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        old_path = IMAGES_STORE + '/' + image_paths[0]
        # rename the hash-named download using characters taken from the image URL
        new_path = IMAGES_STORE + '/' + item['detail_img'][-9:-5] + '.jpg'
        item['detail_img'] = new_path
        try:
            os.rename(old_path, new_path)
        except OSError:
            print('already renamed...')
        return item


class MeizituPipeline(object):
    def open_spider(self, spider):
        self.file = open('meizitu.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        data = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(data)
        return item

    def close_spider(self, spider):
        self.file.close()
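For reference, item_completed receives results as a list of (success, info) tuples, one per request yielded by get_media_requests. For a successful download, the info dict carries the original 'url', the 'path' relative to IMAGES_STORE (a SHA1-based name under full/ by default) and a 'checksum'. A sketch with fabricated values shows what the list comprehension above extracts:

# hypothetical `results` argument, shaped as ImagesPipeline passes it in
results = [
    (True, {'url': 'http://example.com/ab01.jpg',      # made-up URL
            'path': 'full/0a1b2c3d4e5f67890.jpg',      # relative to IMAGES_STORE
            'checksum': 'b9628c4ab9b595f72f280b90c4fd093d'}),
]

image_paths = [x['path'] for ok, x in results if ok]
print(image_paths)  # ['full/0a1b2c3d4e5f67890.jpg']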

4. Configure the settings file ---- settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for Meizitu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'Meizitu'

SPIDER_MODULES = ['Meizitu.spiders']
NEWSPIDER_MODULE = 'Meizitu.spiders'

# directory where ImagesPipeline stores the downloaded files
IMAGES_STORE = '/home/pyvip/Meizitu/Meizitu/spiders/meizitu'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36",
]

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
#    'Accept-Encoding': 'gzip, deflate',
#    'Accept-Language': 'zh-CN,zh;q=0.8',
#    'Cache-Control': 'max-age=0',
#    'Connection': 'keep-alive',
#    'Host': 'www.mzitu.com',
#    'Upgrade-Insecure-Requests': '1',
#    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4094.1 Safari/537.36',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Meizitu.middlewares.MeizituSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Meizitu.middlewares.MeiZiTuSpiderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Meizitu.pipelines.MeizituPipeline': 300,
    'Meizitu.pipelines.ImgPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
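The post does not show middlewares.py, yet DOWNLOADER_MIDDLEWARES activates Meizitu.middlewares.MeiZiTuSpiderMiddleware. For the USER_AGENTS list above to have any effect, that class presumably rotates user agents; a minimal sketch, with the class body entirely assumed (only the name comes from the settings), could look like this:

# Meizitu/middlewares.py -- assumed implementation of a user-agent rotator,
# written as a downloader middleware since that is where it is registered
import random

from Meizitu.settings import USER_AGENTS


class MeiZiTuSpiderMiddleware(object):
    def process_request(self, request, spider):
        # attach a random User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        return None  # let Scrapy continue handling the request

With all four files in place, the crawl is started from the project root with scrapy crawl meizi.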

Run results:

