愛奇藝人物信息scrapy-redis
只需要修改 spider 的類型和 settings,就可以簡單地把普通的 scrapy 爬蟲變成 scrapy-redis 框架下的爬蟲;就算中途停止也可以繼續爬。信息保存在 redis 裡面:停止爬蟲後可以看到 redis 裡面已爬和待爬兩個 key 都有數據,待爬數據類型是 zset,已爬是 set。所有待爬數據爬完後,redis 裡面的數據會清空,兩個 key 會被刪除。
總數據71300條左右,沒有遇到反爬。
主體部分:
# -*- coding: utf-8 -*-
"""iQiyi person-info spider on scrapy-redis.

Seeds from a local name list, resolves each name through iQiyi search to a
person detail page, then keeps crawling by following related-person links.
"""
import scrapy
from scrapy_redis.spiders import RedisSpider

from ..items import IqiyiItem
from . import crawl_base
from . import crawl_movies
from . import crawl_relationship


# class iqisprider(scrapy.Spider):  # plain-scrapy variant, kept for reference
class iqisprider(RedisSpider):
    """Crawl iQiyi person pages; scrapy-redis makes the run resumable."""

    meta_version = 1.0
    result_dir = "/home/lyg/ruyi-scrapy/iqiyi"
    name = "iqiyi"
    # redis_key = "iqiyispider:start_urls"
    custom_settings = {
        # Redis-backed scheduler + dupefilter: pending/seen request state
        # survives a restart, so an interrupted crawl can resume.
        "SCHEDULER": "scrapy_redis.scheduler.Scheduler",
        "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter",
        # "REDIS_HOST": "localhost",
        # "REDIS_PORT": 6379,
        "DOWNLOADER_MIDDLEWARES": {
            # "iqiyi.middlewares.IqiyiSpiderMiddleware": 400,
        },
    }

    # start_urls = ["http://www.iqiyi.com/lib/s_200019905.html"]
    def start_requests(self):
        """Yield one search request per person name in the seed file.

        test.txt holds one name per line (e.g. just "成龍" for a smoke test).
        """
        # `with` guarantees the file is closed even if a yield is abandoned.
        with open("/home/lyg/ruyi-scrapy/iqiyi/doc/test.txt") as f:
            for line in f:
                person = line.strip()
                if not person:
                    # skip blank lines instead of requesting an empty query
                    continue
                url = "http://so.iqiyi.com/so/q_" + person
                yield scrapy.Request(url, callback=self.detail)

    def detail(self, response):
        """From a search-result page, follow the first hit to the person page."""
        selector = scrapy.Selector(response)
        new = selector.xpath('//div[@class="info_item_bottom"]/a/@href').extract()[0].strip()
        yield scrapy.Request(new, callback=self.parse)

    def parse(self, response):
        """Scrape one person page; helpers fill different item sections."""
        item = IqiyiItem()
        item["url"] = response.url
        item = crawl_base.get_base(response, item)
        item = crawl_movies.get_movie(response, item)
        item, others = crawl_relationship.get_rela(response, item)
        yield item
        # breadth-first expansion: crawl every related person discovered
        for href in others:
            # hrefs come back scheme-less (leading slashes stripped upstream)
            yield scrapy.Request("http://" + href, callback=self.parse)
crawl_base.py
import scrapy


def get_base(response, item):
    """Fill `item` with the basic profile fields of an iQiyi person page.

    Args:
        response: Scrapy response for a person detail page.
        item: IqiyiItem (dict-like) to populate in place.

    Returns:
        The same item, with name/description and the basic-info fields set.

    Raises:
        IndexError: if the page's basic-info <dd> list is shorter than the
            expected field count (the page layout is assumed fixed).
    """
    selector = scrapy.Selector(response)
    item["name"] = selector.xpath('//div[@class="result_detail"]/h1/text()').extract()[0]
    item["description"] = selector.xpath('//div[@class="mx_introduce-info"]/p/text()').extract_first()

    basic_info = selector.xpath('//div[@class="basic-info clearfix"]/dl/dd/text()').extract()
    # The <dd> entries appear in a fixed page order; pop() consumes them from
    # the END of the list, so the keys below are listed last-field-first.
    # NOTE(review): assumes every profile page carries all 14 fields — the
    # mapping silently shifts if any field is absent; verify on sparse pages.
    ordered_keys = (
        "UserLikes",       # hobbies
        "agency",          # agency / management company
        "residence",       # current residence
        "constellation",   # zodiac sign
        "place",           # region
        "bloodType",       # blood type
        "alternateName",   # former name
        "famous_year",     # year of fame
        "alumni",          # alma mater
        "birthPlace",      # birthplace
        "birthDate",       # date of birth
        "height",          # height
        "gender",          # gender
        "fname",           # foreign-language name
    )
    for key in ordered_keys:
        item[key] = basic_info.pop().strip()
    return item
crawl_movies.py
import scrapy
# from ..items import movie_info


def get_movie(response, item):
    """Fill `item["movie"]` with the person's filmography sections.

    Each section header div is paired with its immediately-following content
    div (same index in `detail`); every movie becomes a dict with keys
    "name", "time" and "type" (the section title).

    Args:
        response: Scrapy response for a person detail page.
        item: IqiyiItem (dict-like) to populate in place.

    Returns:
        The same item with item["movie"] set to a list of dicts.
    """
    selector = scrapy.Selector(response)
    sections = selector.xpath('//div[@class="m-title-bl mb20 mt20 "]')
    detail = selector.xpath('//div[@class="m-title-bl mb20 mt20 "]//following-sibling::div[1]')

    movie = []
    # enumerate keeps section headers aligned with their sibling content divs
    for n, section in enumerate(sections):
        # NOTE(review): this xpath returns a one-element SelectorList holding
        # a boolean comparison result, which is truthy even when the
        # comparison is False — the guard likely never filters anything.
        # Behavior preserved; confirm intent against a live page.
        if section.xpath('./@class="star-info-proj"'):
            section_title = section.xpath('./h3/text()').extract_first()
            entries = detail[n].xpath('.//div[@class="wrapper-piclist"]/ul/li')
            for entry in entries:
                info = entry.xpath('.//*[@class="site-piclist_info_title"]')
                # one dict per movie: title, date and section type
                movie.append({
                    "name": info.xpath('./a[1]/@title').extract_first(),
                    "time": info.xpath('./a[2]/@title').extract_first(),
                    "type": section_title,
                })
    item["movie"] = movie
    return item
crawl_relationship.py
import scrapy
# from ..items import rela_info


def get_rela(response, item):
    """Extract the related-people panel of an iQiyi person page.

    Args:
        response: Scrapy response for a person detail page.
        item: IqiyiItem (dict-like) to populate in place.

    Returns:
        (item, others): the item with item["relaship"] set to a list of
        {"relaname", "relaship"} dicts, plus `others`, a list of
        scheme-less URLs of the related people (for further crawling).
    """
    selector = scrapy.Selector(response)
    entries = selector.xpath('//div[@class="m-relateStar"]//li')

    others = []
    relationships = []
    for entry in entries:
        info = entry.xpath('.//div[@class="relateStar_info"]/p[1]')
        relationships.append({
            "relaname": info.xpath('./a/@title').extract_first(),
            "relaship": info.xpath('./span/em/text()').extract()[0].strip(),
        })
        # strip leading "//" so the caller can prepend an explicit scheme
        others.append(info.xpath('./a/@href').extract_first().strip("/"))

    # item = rela_info()
    item["relaship"] = relationships
    return item, others
items
import scrapy


class IqiyiItem(scrapy.Item):
    """Single item carrying every scraped field for one iQiyi person."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    url = scrapy.Field()
    item1 = scrapy.Field()
    item2 = scrapy.Field()
    item3 = scrapy.Field()

    # --- basic profile (formerly a separate base_info item) ---
    description = scrapy.Field()      # bio / introduction text
    name = scrapy.Field()             # name
    fname = scrapy.Field()            # foreign-language name
    alternateName = scrapy.Field()    # former name
    birthPlace = scrapy.Field()       # birthplace
    birthDate = scrapy.Field()        # date of birth
    gender = scrapy.Field()           # gender
    height = scrapy.Field()           # height
    weight = scrapy.Field()           # weight
    place = scrapy.Field()            # region
    constellation = scrapy.Field()    # zodiac sign
    UserLikes = scrapy.Field()        # hobbies
    bloodType = scrapy.Field()        # blood type
    brokerageAgency = scrapy.Field()  # nationality (per original comment 國籍 — TODO confirm)
    alumni = scrapy.Field()           # alma mater
    famous_year = scrapy.Field()      # year of fame
    residence = scrapy.Field()        # current residence
    agency = scrapy.Field()           # agency / management company

    # --- filmography (formerly a separate movie_info item) ---
    moviename = scrapy.Field()        # movie title
    movietime = scrapy.Field()        # movie date
    movietyp = scrapy.Field()         # movie type
    movie = scrapy.Field()            # full filmography (list of dicts)

    # --- relationships (formerly a separate rela_info item) ---
    relaname = scrapy.Field()         # related person's name
    relaship = scrapy.Field()         # relationship label
推薦閱讀:
※Scrapy學習實例(三)採集批量網頁
※如何高效學習python的某一個包?
※【爬蟲】用Scrapy做分散式爬蟲:1.環境搭建
※Python網頁信息採集:使用PhantomJS採集某貓寶商品內容
※小白進階之Scrapy第一篇