根據小甲魚第 56 課編寫的爬取煎蛋網 MM 圖的 Python 2.7 小程序。
# -*- coding: utf-8 -*-
"""Crawler for the jandan.net/ooxx image board (Python 2.7).

Written following lesson 56 of the XiaoJiaYu (小甲魚) tutorial series.
Fetches the latest N comment pages and saves every .jpg they link to.
"""
import urllib2
import os
import time


def url_open(url):
    """Fetch *url* and return the raw response bytes.

    A desktop-Chrome User-Agent header is sent so the site does not
    reject the request as coming from a script.
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36')

    # Proxy-based access (kept for reference, disabled):
    # proxies = []
    # proxy = random.choice(proxies)
    # proxy_handler = urllib2.ProxyHandler({'http': random.choice(iplist)})
    # opener = urllib2.build_opener(proxy_handler)
    # html = opener.open(url).read()

    # Direct access (no proxy):
    response = urllib2.urlopen(req)
    html = response.read()

    return html


def get_page(url):
    """Return the current page number (as a string) scraped from *url*.

    The index page contains markup like ``current-comment-page">[2381]``;
    the offset of +23 skips past ``current-comment-page">[`` to the first
    digit, and the closing ``]`` ends the number.
    """
    html = url_open(url).decode('utf-8')

    a = html.find('current-comment-page') + 23
    b = html.find(']', a)

    return html[a:b]


def find_imgs(url):
    """Return a list of absolute .jpg URLs found in the page at *url*."""
    html = url_open(url).decode('utf-8')
    img_addrs = []

    a = html.find('img src=')
    while a != -1:
        # Only accept a '.jpg' within 255 chars of the tag start, so an
        # extension belonging to some far-away image is not matched.
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            # a+9 skips past 'img src="'; b+4 keeps the '.jpg' suffix.
            img_addrs.append("http:" + html[a + 9:b + 4])
        else:
            # No extension nearby: resume scanning just past this tag.
            b = a + 9

        a = html.find('img src=', b)

    return img_addrs


def save_imgs(folder, img_addrs):
    """Download every URL in *img_addrs* into the current directory.

    NOTE(review): *folder* is unused here; download_mm() has already
    os.chdir()'d into it before calling us.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            img = url_open(each)
            time.sleep(5)  # be polite: pause between downloads
            f.write(img)


def download_mm(folder='ooxx', pages=10):
    """Create *folder* and download images from the newest *pages* pages.

    Changes into *folder*, reads the current page number from the site
    index, then walks backwards through consecutive page numbers.
    """
    if not os.path.exists(folder):
        os.mkdir(folder)
    os.chdir(folder)

    url = "http://jandan.net/ooxx/"
    page_num = int(get_page(url))

    for i in range(pages):
        # BUGFIX: the original did ``page_num -= i`` inside the loop,
        # which subtracts a cumulative 0, 1, 3, 6, ... offset and skips
        # pages. Computing ``page_num - i`` fetches the current page and
        # the next (pages - 1) pages consecutively.
        page_url = url + 'page-' + str(page_num - i) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
執行結果:
推薦閱讀:
※python requests 發送中文參數的問題
※用 python 對 github 用戶 followers 分析
※根據小甲魚60課編寫的爬取代理IP代碼(python2.7)
TAG:python爬虫 |