How can I use a crawler to fetch Singapore PSI data?

I want to analyse Singapore's PSI data by extracting the readings from April 2014 to the present from the Historical PSI Readings page (the table in the middle of the page). How should I go about it?


The bottleneck is mainly disk IO: 30 threads fetching pages versus a single thread writing the .csv file. Switching to concurrent writes into a database should be much faster.
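If you do go the database route, the writer side could look something like this minimal sketch (assuming SQLite; the table name psi and the column layout are my own choices, mirroring the columns the script below collects, not part of the original code):

import sqlite3

def save_to_db(db_path, queue, quit_token="Quit"):
    # single consumer: drain the queue and batch-insert each scraped page
    conn = sqlite3.connect(db_path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS psi ("
        "year INTEGER, month INTEGER, day INTEGER, time TEXT, "
        "north TEXT, south TEXT, east TEXT, west TEXT, central TEXT, overall TEXT)"
    )
    while True:
        lines = queue.get()
        if isinstance(lines, str) and lines == quit_token:
            break
        # one executemany + commit per page keeps transactions few but small
        conn.executemany(
            "INSERT INTO psi VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", lines)
        conn.commit()
    conn.close()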

Sometimes a RecursionError shows up right at the start, even though my code doesn't use any recursion.

Traceback (most recent call last):
  File "C:\Python 3.5\lib\multiprocessing\queues.py", line 241, in _feed
    obj = ForkingPickler.dumps(obj)
  File "C:\Python 3.5\lib\multiprocessing\reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
RecursionError: maximum recursion depth exceeded

Digging through the source, I found it is caused by the Queue serialising (pickling) the data it passes along. After some Googling, the fix I found was to raise the recursion limit to something large, say one million:

sys.setrecursionlimit(1000000)
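A likely root cause (my reading, not stated in the original): tr.strings yields BeautifulSoup NavigableString objects, and each of them keeps a reference back to the whole parse tree, so pickling them for the Queue walks the entire document. Converting every cell to a plain str before queue.put sidesteps the problem without touching the recursion limit, e.g. in resolvePage below:

# convert NavigableString cells to plain str so the Queue pickles small objects
datas = [year, month, day] + [str(x) for x in tr.strings if x != " "]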

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: loveNight

import os
import sys
import csv
import time
import requests
import threading
from multiprocessing import Queue
from datetime import datetime, timedelta
from bs4 import BeautifulSoup as BS
from multiprocessing.dummy import Pool

sys.setrecursionlimit(1000000)  # recursion limit; the default is only 1000
os.chdir(sys.path[0])

url_pattern = r"http://www.nea.gov.sg/anti-pollution-radiation-protection/air-pollution-control/psi/historical-psi-readings/year/{0}/month/{1}/day/{2}"
# table header
table_header = ["Year", "Month", "Day", "Time", "North", "South",
                "East", "West", "Central", "Overall Singapore"]

headers = {
    "Accept-Encoding": "gzip,deflate,sdch",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
    "Host": "www.nea.gov.sg",
}
session = requests.Session()
session.headers = headers
delay = 0  # delay between network requests

QUIT = "Quit"
queue = Queue()

# dates to download
dt = datetime(2014, 4, 1)
dt_now = datetime.now()
todo = []
while dt <= dt_now:
    todo.append(dt)
    dt += timedelta(days=1)


# fetch a page
def getPage(url):
    if delay:
        time.sleep(delay)
    return session.get(url).text


# consumer: write rows from the queue into the CSV file
def save(filename):
    start = time.time()
    with open(filename, "w", newline="") as output:
        writer = csv.writer(output)
        writer.writerow(table_header)
        while True:
            lines = queue.get()
            if isinstance(lines, str) and lines == QUIT:
                break
            else:
                print("Got data, writing", datetime.now())
                writer.writerows(lines)
    print("Done writing! Took %s seconds" % (time.time() - start))


# parse the page for a given date
def resolvePage(dt):
    year = dt.year
    month = dt.month
    day = dt.day
    html = getPage(url_pattern.format(year, month, day))
    soup = BS(html, "lxml")  # needs the third-party lxml parser; the built-in html.parser also works
    table = soup.find(name="table", class_="text_psinormal")
    if table:
        trs = table.find_all("tr")
        trs = trs[2:]  # drop the header rows
        lines = []
        for tr in trs:
            datas = [year, month, day] + [x for x in tr.strings if x != " "]
            lines.append(datas)
        queue.put(lines)  # hand over the whole table at once


# start downloading
filename = "data.csv"
t = threading.Thread(target=save, args=(filename,))
t.daemon = True
t.start()

pool = Pool(30)
pool.map(resolvePage, todo)
pool.close()
pool.join()

queue.put(QUIT)

It took 127 seconds.

Then open data.csv (in the same directory as the script) in Excel and sort by Year, Month, Day, Time. There are a bit over 14,000 rows in total.
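If you would rather not sort in Excel, a short pandas snippet does the same (a sketch, assuming pandas is installed; not part of the original script):

import pandas as pd

df = pd.read_csv("data.csv")
# "Time" is text like "1am"/"10pm", so sorting on it lexically would be wrong;
# Year/Month/Day already puts the rows in day order
df.sort_values(["Year", "Month", "Day"]).to_csv("data_sorted.csv", index=False)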


Thanks for the invite.
Code first:

# -*- coding: utf-8 -*-
import requests
import re
import datetime
import os
import csv
class Spider():

    def __init__(self):
        self.url = u"http://www.nea.gov.sg/anti-pollution-radiation-protection/air-pollution-control/psi/historical-psi-readings/year/yearNum/month/monthNum/day/dayNum"
        self.headers = {
            "Host": "www.nea.gov.sg",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
            "Connection": "keep-alive"
        }
        # NOTE: the HTML-tag regexes in the original answer were mangled in
        # transit; the tags below are a best guess based on the page structure
        # (rows of <tr> whose cells sit inside <span> elements)
        zuhe1 = u"<tr.*?>.*?"
        zuhe2 = u"<span.*?>(.*?)</span>.*?"
        zuhe3 = u"</tr>"
        self.zuhe = zuhe1 + zuhe2 * 7 + zuhe3

    def tool(self, x):
        # strip HTML tags and line breaks from a captured cell
        x = re.sub(re.compile("<.*?>"), "", x)
        x = re.sub(re.compile("\r"), "", x)
        x = re.sub(re.compile("\n"), "", x)
        x = x.encode("utf-8")
        return x.strip()

    def handleDate(self, year, month, day):
        # return a date object for the given year/month/day
        date = datetime.date(year, month, day)
        return date

    def timeDelta(self, year, month):
        # number of days in the given month
        date = datetime.date(year, month, 1)
        try:
            date2 = datetime.date(date.year, date.month + 1, date.day)
        except ValueError:
            date2 = datetime.date(date.year + 1, 1, date.day)
        dateDelta = (date2 - date).days
        return dateDelta

    def getPageContent(self, date):
        # download the page for one date
        url = self.url
        url = url.replace(u"yearNum", str(date.year))
        url = url.replace(u"monthNum", str(date.month))
        url = url.replace(u"dayNum", str(date.day))
        r = requests.get(url, headers=self.headers)
        r.encoding = "utf-8"
        pageContent = r.text
        return pageContent

    def getPageInfos(self, pageContent):
        # pattern1 is likewise a reconstruction of a mangled regex; it is
        # assumed to grab the table body before self.zuhe picks out the rows
        pattern1 = re.compile(u"<tbody>(.*?)</tbody>", re.S)
        result1 = re.search(pattern1, pageContent)
        content1 = result1.group(1)
        pattern2 = re.compile(self.zuhe, re.S)
        infos = re.findall(pattern2, content1)
        return infos

    def saveInfo(self, info, date):
        # append one row to psi/YYYY/MM/DD.csv
        fileName = "psi/" + date.strftime("%Y") + "/" + \
            date.strftime("%m") + "/" + date.strftime("%d") + ".csv"
        if os.path.exists(fileName):
            mode = "ab"
        else:
            mode = "wb"
        csvfile = file(fileName, mode)
        writer = csv.writer(csvfile)
        writer.writerow([self.tool(i) for i in info])
        csvfile.close()

    def mkdir(self, date):
        # create psi/YYYY/MM if it does not exist yet
        path = "psi/" + date.strftime("%Y") + "/" + date.strftime("%m")
        isExists = os.path.exists(path)
        if not isExists:
            os.makedirs(path)

    def saveAllInfo(self, infos, date):
        for info in infos:
            self.mkdir(date)
            self.saveInfo(info, date)

The flow is three steps: open the page, extract the information, save the information.
Since months have different lengths, I first compute the number of days in each month.
Some dates include extra PM2.5 readings; I simply skip those.
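As an aside, the days-in-month logic in timeDelta can also be written with the standard library's calendar module, which gives the same result:

import calendar

# monthrange returns (weekday of the 1st, number of days in the month)
days_in_month = calendar.monthrange(2015, 3)[1]  # 31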

The driver code:

# -*- coding: utf-8 -*-
# crawler for Singapore PSI readings, 2014-1-1 ~ today
import spider

s = spider.Spider()

# which month to fetch
year = 2015
month = 3
delta = s.timeDelta(year, month)

# crawl one month at a time
print year, month, "start!"
for day in range(1, delta + 1):
    # date object
    date = s.handleDate(year, month, day)
    # page HTML
    pageContent = s.getPageContent(date)
    # extract the readings
    infos = s.getPageInfos(pageContent)
    # save them
    s.saveAllInfo(infos, date)
    print day, "/", delta
print year, month, "end!"

The data is saved under the path psi/year/month/day.csv. The code that merges it all:

# -*- coding: utf-8 -*-
import os
for yearNum in [2014, 2015]:
    listMonth = os.listdir("psi/" + str(yearNum) + "/")
    for monthNum in listMonth:
        openPath = "psi/" + str(yearNum) + "/" + str(monthNum) + "/"
        listFileName = os.listdir(openPath)
        for fileName in listFileName:
            a = open(openPath + fileName, "r")
            content = a.read()
            a.close()
            if os.path.exists(u"hebing.csv"):
                mode = "a"
            else:
                mode = "w"
            b = open(u"hebing.csv", mode)
            b.write(content)
            b.close()
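For what it's worth, the merge can also be done with pandas, which lets you attach column names in the same step (a sketch; the column names are my guess at the table layout, since the per-day files were written without a header row):

import glob
import pandas as pd

columns = ["Time", "North", "South", "East", "West", "Central", "Overall"]
# zero-padded month/day directories make a lexical sort chronological
files = sorted(glob.glob("psi/*/*/*.csv"))
merged = pd.concat((pd.read_csv(f, header=None, names=columns) for f in files),
                   ignore_index=True)
merged.to_csv("hebing.csv", index=False)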

Well, that's about it. Sorry the code is so ugly...


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2015-11-10 16:21:59
import requests
from bs4 import BeautifulSoup
url = "http://www.nea.gov.sg/anti-pollution-radiation-protection/air-pollution-control/psi/historical-psi-readings/year/2014/month/4/day/1"

req = requests.get(url)

soup = BeautifulSoup(req.text, "html.parser")

PSI_infos = soup.find("table", class_="text_psinormal").find(
    "tbody").find_all("tr")

for info in PSI_infos:
    items = info.find_all("span", id=True)
    for item in items:
        print item.get_text(),
    print

Output:

1am 55 54 54 58 54 54-58
2am 55 54 54 59 54 54-59
3am 55 55 54 60 55 54-60
4am 56 55 55 62 55 55-62
5am 57 55 56 63 56 55-63
6am 58 56 56 64 56 56-64
7am 59 57 57 65 57 57-65
8am 60 57 58 66 58 57-66
9am 59 57 58 66 59 57-66
10am 60 58 58 67 59 58-67
11am 60 57 59 67 59 57-67
12pm 60 57 58 67 59 57-67
1pm 59 56 58 67 59 56-67
2pm 59 56 58 67 59 56-67
3pm 59 56 57 67 59 56-67
4pm 58 55 57 66 59 55-66
5pm 58 55 58 66 59 55-66
6pm 57 55 57 66 59 55-66
7pm 57 55 57 66 59 55-66
8pm 58 56 57 66 59 56-66
9pm 58 56 57 66 59 56-66
10pm 58 56 57 65 58 56-65
11pm 58 56 57 65 58 56-65
12am 58 55 57 64 58 55-64

[Finished in 0.6s]


@張天
Checking with F12 in Firefox, a few JS scripts fail to load; their domains point to Google. The crawler will probably have to supply those scripts itself (jquery.min.js at least is very common; you can search for the others).

