
Scraping a video-code list and DVD cover images

Three weeks into learning Python, I tried a hands-on project today, and plenty of problems came up.

Crawler description: crawl a given actress's main page and save her name and profile; follow each yearly release link on that page and scrape the video code, release date, and title of every entry; then save each title's cover image, named after its video code.

# -*- coding: gbk -*-
import urllib
import urllib2
import re
import os


class AVNY(object):
    def __init__(self, baseurl, newurl):
        self.URL = baseurl
        self.NEWURL = newurl

    def getPage(self):  # fetch the actress's main page
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3226.400 QQBrowser/9.6.11681.400'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(self.URL, headers=headers)
        response = urllib2.urlopen(request)
        page = response.read().decode('utf-8')
        return page

    def getName(self, page):  # extract the actress's name
        pattern = re.compile(r'<div class="well_tit.*?<h1>(.*?)</h1>', re.S)
        title = re.search(pattern, page)
        return title.group(1)

    def getAbstract(self, page):  # extract the actress's profile
        pattern = re.compile(r'<div class="well_tit.*?<.*?"avms">(.*?)</p>', re.S)
        abstract = re.search(pattern, page)
        return abstract.group(1)

    def getNewUrls(self, page):  # collect the per-year release-list links
        pattern = re.compile(r'<button.*?><a href="(.*?)">.*?</a></button>', re.S)
        NewUrls = re.findall(pattern, page)
        return NewUrls

    def getNewPage(self, url):  # fetch a per-year release-list page
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3226.400 QQBrowser/9.6.11681.400'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        newpage = response.read().decode('utf-8')
        return newpage

    def getMessages(self, newpage):  # extract video code, release date and title for each entry
        pattern = re.compile(r'<div class="list_text">.*?<a href.*?[0-9]/(.*?).html">.*?<date>(.*?)</date><p>(.*?)</p>', re.S)
        Messages = re.findall(pattern, newpage)
        contents = []
        for message in Messages:
            contents.append(" ___ ".join([message[0], message[1], message[2]]))
        return contents

    def saveBrief(self, page):  # write the actress's name, profile and release info to a text file
        filename = self.getName(page) + '.txt'
        abstract = self.getAbstract(page)
        txt = open(filename, "w")
        txt.write(abstract.encode('utf-8'))  # encode before writing: the page was decoded to unicode
        txt.write('\n')
        for Newurl in self.getNewUrls(page):
            digiturl = self.NEWURL + Newurl
            newpage = self.getNewPage(digiturl)
            line = self.getMessages(newpage)
            for i in line:
                txt.write(i.encode('utf-8'))
                txt.write('\n')
        txt.close()
        print "txt is done"

    def getImg(self, newpage):  # map each video code (key) to its cover-image URL (value)
        pattern = re.compile(r'<img data-original="(.*?)"></a></span>.*?<div class="list_text">.*?<a href.*?[0-9]/(.*?).html">', re.S)
        IMG = re.findall(pattern, newpage)
        IMGS = {}
        for i in IMG:
            IMGS[i[1]] = i[0]
        return IMGS

    def mkdir(self, path):  # create the output directory if it does not exist yet
        path = path.strip()
        path = path.rstrip("\\")
        isExists = os.path.exists(path)
        if not isExists:
            os.makedirs(path)
            print path + ' created'
            return True
        else:
            print path + ' already exists'
            return False

    def saveIMGs(self, page, newpage):  # download the cover images for one year's releases
        mkpath = os.path.join(os.getcwd(), self.getName(page))
        self.mkdir(mkpath)
        IMGS = self.getImg(newpage)
        for ImgName in IMGS:
            ImgUrl = IMGS[ImgName]
            name = os.path.join(mkpath, ImgName + ".jpg")
            urllib.urlretrieve(ImgUrl, name)
            print name, "is done"
        print "one year is done"

    def saveAllImgs(self, page):  # download the cover images for every year
        for Newurl in self.getNewUrls(page):
            digiturl = self.NEWURL + Newurl
            newpage = self.getNewPage(digiturl)
            self.saveIMGs(page, newpage)

    def start(self):  # entry point
        page = self.getPage()
        self.saveBrief(page)
        self.saveAllImgs(page)


baseurl = r'http://nanrenvip.net/baishimolinai'
newurl = "http://nanrenvip.net"
S = AVNY(baseurl, newurl)
S.start()

Problem still to be solved: urllib.urlretrieve often stalls, which keeps the script from running to completion.
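One common workaround for the stalling issue (not part of the original script, just a sketch) is to drop urllib.urlretrieve, which has no timeout, and fetch each image with urllib2.urlopen using an explicit timeout plus a few retries. The helper name download_with_retry below is hypothetical; the idea is that it would stand in for the urllib.urlretrieve(ImgUrl, name) call inside saveIMGs.

import socket
import urllib2

socket.setdefaulttimeout(30)  # a stalled connection now raises socket.timeout instead of hanging forever

def download_with_retry(url, filename, retries=3, timeout=30):
    # Sketch of a urlretrieve replacement: fetch with an explicit timeout, retry on failure.
    for attempt in range(1, retries + 1):
        try:
            response = urllib2.urlopen(url, timeout=timeout)
            with open(filename, "wb") as f:
                f.write(response.read())
            return True
        except (urllib2.URLError, socket.timeout) as e:
            print "attempt %d failed for %s: %s" % (attempt, url, e)
    return False

With something like this in place, a download that hangs is abandoned after the timeout and retried, instead of blocking the whole run.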

Site address: nanrenvip.net/


TAG: Python