Python新聞頁正文抽取

02-05

# -*- coding: utf-8 -*-
"""News-page main-text extraction, v1.0 (original author: GoGo闖@流量販子).

The page is downloaded, reduced to its ``<p>`` skeleton, split into lines and
scored with a sliding window of ``blockSize`` lines; the densest run of text
is returned as the article body.

The code runs on Python 2 and Python 3. Page text is handled as the
interpreter's native ``str`` (UTF-8 bytes on Python 2, text on Python 3).
"""

import multiprocessing
import re
import sys

try:
    import requests
except ImportError:  # only needed when a page is actually downloaded
    requests = None

try:
    import MySQLdb as mdb
except ImportError:  # not used by the extractor itself
    mdb = None

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

_PY2 = sys.version_info[0] == 2

if _PY2:
    # Legacy Python 2 behaviour of this module: make implicit str/unicode
    # conversions use UTF-8 instead of ASCII.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

DBUG = 0

reBODY = re.compile(r'<body.*?>([\s\S]*?)</body>', re.I)
reBODY2 = re.compile(r'<script.*?>([\s\S]*?)</script>', re.I)
reBODY3 = re.compile(r'<style.*?>([\s\S]*?)</style>', re.I)
# Innermost balanced {...} group; applied repeatedly so nested groups vanish
# without deleting the text that lies between two unrelated groups.
reBODY4 = re.compile(r'\{[^{}]*\}')
# NOTE(review): the original pattern was lost in the paste; an HTML-comment
# matcher is assumed here.
reCOMM = r'<!--[\s\S]*?-->'

reTITLE = re.compile(r'<title[^>]*>([\s\S]*?)</title>', re.I)
reIMG = re.compile(r'<img\b[^>]*?\bsrc\s*=\s*["\']([^"\']*)["\']', re.I)
reCHARSET = re.compile(br'<meta[^>]+charset\s*=\s*["\']?\s*([\w-]+)', re.I)
# Every tag except <p ...> and </p>, plus a "next article" link and whatever
# follows it on the same line.
reNONP = re.compile(r'<(?!/?p\b)[^<>]*>|下一篇.*', re.I)
rePOPEN = re.compile(r'<p\b[^>]*>', re.I)

DEFAULT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # "sdch" is not advertised: requests cannot decode it.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
}


def search(req, html):
    """Return group 1 of the first match of *req* in *html*.

    *req* may be a pattern string or a compiled pattern. When nothing
    matches, the sentinel string ``'no'`` is returned.
    """
    found = re.search(req, html)
    if found:
        return found.group(1)
    return 'no'


def _native(text):
    """Convert decoded text to the interpreter's native ``str`` type."""
    if _PY2:
        return text.encode('utf-8')
    return text


def decodePage(raw, declared=None):
    """Decode the raw bytes of a page into native ``str``.

    Candidate encodings are tried in this order: the charset from the HTTP
    header (*declared*), the charset of a ``<meta>`` tag, UTF-8, GBK.
    ``iso-8859-1`` from the header is skipped because requests reports it for
    any ``text/*`` response that carries no charset at all. ``gb2312`` is
    decoded as its superset GBK. If every candidate fails, the bytes are
    decoded as UTF-8 with replacement characters.
    """
    candidates = []
    if declared and declared.lower() != 'iso-8859-1':
        candidates.append(declared)
    sniffed = reCHARSET.search(raw)
    if sniffed:
        candidates.append(sniffed.group(1).decode('ascii'))
    candidates.extend(['utf-8', 'gbk'])

    for name in candidates:
        if name.lower() == 'gb2312':
            name = 'gbk'
        try:
            return _native(raw.decode(name))
        except (LookupError, UnicodeDecodeError):
            continue
    return _native(raw.decode('utf-8', 'replace'))


class Extractor(object):
    """Extract the title and the main text of one news page.

    Args:
        url: address of the page.
        blockSize: number of consecutive lines summed into one density block.
        timeout: HTTP timeout in seconds.
        image: stored as ``saveImage``; not used by the extraction itself.
        proxies: optional requests-style proxy mapping, e.g.
            ``{"http": "http://user:pass@host:port", "https": ...}``.
    """

    def __init__(self, url="", blockSize=3, timeout=5, image=False, proxies=None):
        self.url = url
        self.blockSize = blockSize
        self.timeout = timeout
        self.saveImage = image
        self.proxies = proxies
        self.rawPage = ""
        self.ctexts = []
        self.cblocks = []

    def getRawPage(self):
        """Download ``self.url`` and return ``(status_code, page_text)``.

        Also stores the stripped ``<title>`` text in ``self.title`` (``'no'``
        when the page has no title). Network errors raised by requests
        propagate to the caller.
        """
        if requests is None:
            raise ImportError("the 'requests' package is required to download pages")

        # No explicit Host header: requests derives it from the URL, which
        # also stays correct across redirects.
        resp = requests.get(
            self.url,
            headers=DEFAULT_HEADERS,
            timeout=self.timeout,
            proxies=self.proxies,
        )
        code = resp.status_code

        if DBUG:
            print("%s %s" % (resp.encoding, resp.headers.get('content-type')))

        content = decodePage(resp.content, resp.encoding)
        self.title = search(reTITLE, content).strip()
        return code, content

    def processTags(self):
        """Reduce ``self.body`` to text with bare ``<p>``/``</p>`` markers.

        The absolute URL of the first image is stored in ``self.img``
        (``'no'`` when the body has no image).
        """
        # The image must be located before the tags are stripped.
        picture = reIMG.search(self.body)
        if picture:
            self.img = urljoin(self.url, picture.group(1).strip())
        else:
            self.img = 'no'

        body = reBODY2.sub("", self.body)
        body = reBODY3.sub("", body)
        previous = None
        while previous != body:
            previous = body
            body = reBODY4.sub("", body)
        body = re.sub(reCOMM, "", body)
        body = reNONP.sub('', body)
        body = rePOPEN.sub('<p>', body)
        # Newlines are kept: processBlocks splits on them.
        self.body = re.sub(r'[\t\r\f\v]', '', body)

    def processBlocks(self):
        """Return the densest run of lines of ``self.body``, concatenated.

        Block ``i`` is the total length of lines ``i .. i + blockSize - 1``.
        Starting from the block with the most text, the selection grows in
        both directions while blocks hold more text than the shortest line.
        The chosen line range is stored in ``self.start`` / ``self.end``.
        """
        self.ctexts = self.body.split("\n")
        self.textLens = [len(line) for line in self.ctexts]

        lines = len(self.ctexts)
        # Clamp the window so short bodies still yield at least one block.
        size = max(1, min(self.blockSize, lines))
        windows = lines - size + 1
        self.cblocks = [sum(self.textLens[i:i + size]) for i in range(windows)]

        threshold = min(self.textLens)
        maxTextLen = max(self.cblocks)

        if DBUG:
            print(maxTextLen)

        if maxTextLen <= threshold:
            # No block stands out (e.g. a single-line body): keep everything.
            self.start, self.end = 0, lines
        else:
            start = end = self.cblocks.index(maxTextLen)
            while start > 0 and self.cblocks[start] > threshold:
                start -= 1
            while end < windows and self.cblocks[end] > threshold:
                end += 1
            if end == windows:
                # The dense run reaches the last window: include its tail.
                end = lines
            self.start, self.end = start, end

        return "".join(self.ctexts[self.start:self.end])

    def getContext(self):
        """Download the page and return ``(title, main_text)``."""
        code, self.rawPage = self.getRawPage()
        bodies = reBODY.findall(self.rawPage)
        # Pages without a <body> element are processed as a whole.
        self.body = bodies[0] if bodies else self.rawPage

        if DBUG:
            print("%s %s" % (code, self.rawPage))

        self.processTags()
        return self.title, self.processBlocks()


def getZwIndex(url):
    """Return ``(title, main_text)`` for *url* using one-line blocks."""
    ext = Extractor(url=url, blockSize=1, image=False)
    return ext.getContext()


# Example: extract several pages in parallel.
#
#     pool = multiprocessing.Pool(processes=3)
#     for url in url_list:
#         pool.apply_async(getZwIndex, (url,))
#     pool.close()
#     pool.join()