Scraping 51job Job Listings with Python's Scrapy Framework

The Python version is 3.6, and the scraped data is stored in MySQL 5.5.

The 51job search link is the page titled 【數據分析師招聘,求職】-前程無憂. The URL itself is quite a beast; it took a while to work out where the page number actually sits in it.
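The page index turns out to be the only part of that long URL that changes from page to page: it sits between the double-URL-encoded keyword segment (數據分析師) and the .html suffix. A minimal sketch of the splice, using the same two URL halves as the spider's start_requests further below (the query string is shortened here):

# Hypothetical illustration of how the page number is spliced into the search URL
url_1 = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,'
url_2 = '.html?lang=c&stype=1&postchannel=0000&...'  # rest of the query string stays fixed

def page_url(page):
    return url_1 + str(page) + url_2

print(page_url(1))   # first results page
print(page_url(2))   # second results page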

First, run.py, which makes it possible to run Scrapy from inside the IDE:

from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl job51" on the command line
execute(['scrapy', 'crawl', 'job51'])

The fields to store, items.py:

import scrapy


class Job51Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    zhiwei = scrapy.Field()
    gongsi = scrapy.Field()
    didian = scrapy.Field()
    xinzi = scrapy.Field()
    gongsileixing = scrapy.Field()
    guimo = scrapy.Field()
    hangye = scrapy.Field()
    jingyan = scrapy.Field()
    xueli = scrapy.Field()
    fuli = scrapy.Field()
    zhiweiyaoqiu = scrapy.Field()
    lianjie = scrapy.Field()

The spider entry point, job51.py:

import re

import scrapy
from bs4 import BeautifulSoup
from items import Job51Item


class Myspider(scrapy.Spider):
    name = 'job51'
    allowed_domains = ['51job.com']
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

    def start_requests(self):
        for i in range(1, 208):
            url_1 = 'http://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,'
            url_2 = '.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
            url = url_1 + str(i) + url_2
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    # Collect the detail-page URL of every job on a search-results page
    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        tls = soup.find_all('p', class_='t1 ')
        for tl in tls:
            url = tl.find('a', target='_blank')['href']
            yield scrapy.Request(url, callback=self.get_content, meta={'url': url})

    # Parse the detail-page fields with BeautifulSoup. Some companies host their own
    # listing pages, which this parser cannot handle, and the job-requirements section
    # is messy on some pages, so a few characters still need cleaning afterwards.
    def get_content(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        item = Job51Item()
        item['zhiwei'] = soup.find('h1').get_text().replace('\xa0', '')
        item['gongsi'] = soup.find('p', class_='cname').find('a', target='_blank').get_text().replace('\xa0', '')
        item['didian'] = soup.find('span', class_='lname').get_text().replace('\xa0', '')
        item['xinzi'] = soup.find('div', class_='cn').find('strong').get_text().replace('\xa0', '')
        gongsixinxi = soup.find('p', class_='msg ltype').get_text().replace(' ', '').replace('\r', '').replace('\n', '').replace('\xa0', '')
        item['gongsileixing'] = gongsixinxi.split('|')[0]
        item['guimo'] = gongsixinxi.split('|')[1]
        item['hangye'] = gongsixinxi.split('|')[2]
        zhaopinyaoqiu = soup.find('div', class_='t1').get_text().replace('\xa0', '')
        item['jingyan'] = zhaopinyaoqiu.split('\n')[1]
        try:
            item['xueli'] = re.findall(r'<em class="i2"></em>(.*?)</span>', response.text)[0]
        except:
            item['xueli'] = '無'
        try:
            item['fuli'] = soup.find('p', class_='t2').get_text().replace('\n', '').replace('\xa0', '')
        except:
            item['fuli'] = '無'
        item['zhiweiyaoqiu'] = re.findall(r'<div class="bmsg job_msg inbox">(.*?)<div class="mt10">', response.text, re.I | re.S | re.M)[0] \
            .replace('\r', '').replace('\n', '').replace(' ', '').replace('\xa0', '').replace('<br>', '').replace('<br/>', '') \
            .replace('<P>', '').replace('</P>', '').replace('?', '').replace('<p>', '').replace('</p>', '') \
            .replace('<div>', '').replace('</div>', '').replace('<BR>', '').replace('</BR>', '')
        item['lianjie'] = response.meta['url']
        yield item
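The long chain of replace() calls in get_content removes HTML tags one by one. As an optional simplification (not what the spider above does), a single regex could strip any tag before the whitespace cleanup; a small sketch:

import re

def clean_job_text(fragment):
    # Drop every HTML tag, then remove ordinary and non-breaking spaces and line breaks.
    text = re.sub(r'<[^>]+>', '', fragment)
    return text.replace('\xa0', '').replace(' ', '').replace('\r', '').replace('\n', '')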

The SQL layer, sql.py:

import pymysql.cursors

import settings

MYSQL_HOSTS = settings.MYSQL_HOSTS
MYSQL_USER = settings.MYSQL_USER
MYSQL_PASSWORD = settings.MYSQL_PASSWORD
MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB = settings.MYSQL_DB

cnx = pymysql.connect(host=MYSQL_HOSTS, port=MYSQL_PORT, user=MYSQL_USER,
                      passwd=MYSQL_PASSWORD, db=MYSQL_DB, charset='gbk')
cur = cnx.cursor()


class Sql:
    @classmethod
    def insert_job51(cls, zhiwei, gongsi, didian, xinzi, gongsileixing, guimo,
                     hangye, jingyan, xueli, fuli, zhiweiyaoqiu, lianjie):
        sql = ('INSERT INTO job51(zhiwei,gongsi,didian,xinzi,gongsileixing,guimo,hangye,'
               'jingyan,xueli,fuli,zhiweiyaoqiu,lianjie) '
               'VALUES(%(zhiwei)s,%(gongsi)s,%(didian)s,%(xinzi)s,%(gongsileixing)s,%(guimo)s,'
               '%(hangye)s,%(jingyan)s,%(xueli)s,%(fuli)s,%(zhiweiyaoqiu)s,%(lianjie)s)')
        value = {'zhiwei': zhiwei, 'gongsi': gongsi, 'didian': didian, 'xinzi': xinzi,
                 'gongsileixing': gongsileixing, 'guimo': guimo, 'hangye': hangye,
                 'jingyan': jingyan, 'xueli': xueli, 'fuli': fuli,
                 'zhiweiyaoqiu': zhiweiyaoqiu, 'lianjie': lianjie}
        cur.execute(sql, value)
        cnx.commit()
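Before starting the crawl it can be worth checking that the connection and table work by calling the helper once with throwaway values. A quick standalone sketch; the dummy strings are made up, and it assumes sql.py and settings.py are importable from the working directory:

from sql import Sql

# Insert one dummy row to verify the MySQL connection, charset and table definition
Sql.insert_job51('測試職位', '測試公司', '上海', '1-1.5萬/月', '民營公司', '150-500人',
                 '互聯網', '3-4年經驗', '本科', '五險一金', '職位要求測試', 'http://example.com')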

The storage pipeline, pipelines.py:

from .sql import Sql
from items import Job51Item


class Job51Pipeline(object):
    def process_item(self, item, spider):
        zhiwei = item['zhiwei']
        gongsi = item['gongsi']
        didian = item['didian']
        xinzi = item['xinzi']
        gongsileixing = item['gongsileixing']
        guimo = item['guimo']
        hangye = item['hangye']
        jingyan = item['jingyan']
        xueli = item['xueli']
        fuli = item['fuli']
        zhiweiyaoqiu = item['zhiweiyaoqiu']
        lianjie = item['lianjie']
        Sql.insert_job51(zhiwei, gongsi, didian, xinzi, gongsileixing, guimo,
                         hangye, jingyan, xueli, fuli, zhiweiyaoqiu, lianjie)
        print('寫入職位信息')
        return item  # return the item so any later pipeline still receives it

First, create a new table in MySQL containing all of the fields:

CREATE TABLE `job51` (
  `id` int(10) NOT NULL AUTO_INCREMENT,
  `zhiwei` varchar(255) DEFAULT NULL,
  `gongsi` varchar(255) DEFAULT NULL,
  `didian` varchar(255) DEFAULT NULL,
  `xinzi` varchar(255) DEFAULT NULL,
  `gongsileixing` varchar(255) DEFAULT NULL,
  `guimo` varchar(255) DEFAULT NULL,
  `hangye` varchar(255) DEFAULT NULL,
  `jingyan` varchar(255) DEFAULT NULL,
  `xueli` varchar(255) DEFAULT NULL,
  `fuli` varchar(255) DEFAULT NULL,
  `zhiweiyaoqiu` text,
  `lianjie` text,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=gbk

The configuration, settings.py:

BOT_NAME = 'job51'

SPIDER_MODULES = ['job51.spiders']
NEWSPIDER_MODULE = 'job51.spiders'

MYSQL_HOSTS = 'localhost'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
MYSQL_PORT = 3306
MYSQL_DB = 'job51'

ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'job51.pipelines.Job51Pipeline': 300,
}

HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

The crawl ended up with 9,705 records in total; the analysis will have to wait for another day.
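A quick way to confirm the row count in MySQL is a COUNT query with the same connection settings as above (a small sketch using the values from settings.py):

import pymysql

cnx = pymysql.connect(host='localhost', port=3306, user='root',
                      passwd='123456', db='job51', charset='gbk')
cur = cnx.cursor()
cur.execute('SELECT COUNT(*) FROM job51')
print(cur.fetchone()[0])  # should print 9705 after the full crawl
cnx.close()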

