A Pixiv Ranking Crawler from Scratch

I had originally planned to write a Pixiv ranking crawler over the May Day holiday, but I slacked off for three days, which was rather awkward... In the end I scraped it together in one evening. There are still plenty of gaps left unfilled (images that end in .png are still saved as .jpg; duplicate file names (fixed); works that actually contain 2 or more images (fixed); ...)
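For the .png issue above, a minimal fix would be to derive the extension from the original image's URL instead of hard-coding .jpg. This is just a sketch (ext_from_url is a hypothetical helper, not part of the script below):

import os

# Hypothetical helper: take the real extension from the original-image URL,
# falling back to .jpg when the URL has an unexpected ending.
def ext_from_url(img_url):
    ext = os.path.splitext(img_url)[1].lower()  # e.g. '.png', '.jpg'
    return ext if ext in ('.jpg', '.jpeg', '.png', '.gif') else '.jpg'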

The real difficulty, though, is logging in: Pixiv won't serve pages unless you are logged in, which makes crawling awkward.

se = requests.session()

class Pixiv():
    def __init__(self):
        # login page URL
        self.base_url = 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index'
        # login API endpoint
        self.login_url = 'https://accounts.pixiv.net/login?lang=zh'
        # ranking page URL
        self.target_url = 'https://www.pixiv.net/ranking_area.php?type=detail&no=6'
        # Pixiv homepage
        self.main_url = 'http://www.pixiv.net'
        # headers sent with every request; without them the page is not returned
        self.headers = {
            'Referer': 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3013.3 Safari/537.36'
        }
        self.pixiv_id = 'XXXXXXXX@qq.com'  # account
        self.password = 'XXXXXXXX'         # password
        self.post_key = []
        self.return_to = 'http://www.pixiv.net/'

    def login(self):
        # fetch the login page and capture the post_key hidden in an <input> tag
        post_key_html = se.get(self.base_url, headers=self.headers).text
        post_key_soup = BeautifulSoup(post_key_html, 'lxml')
        self.post_key = post_key_soup.find('input')['value']
        data = {
            'pixiv_id': self.pixiv_id,
            'password': self.password,
            'return_to': self.return_to,
            'post_key': self.post_key
        }
        se.post(self.login_url, data=data, headers=self.headers)
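To check that the login actually worked, one quick sanity check (my own addition, not part of the original script; the 'logout' marker is an assumption about Pixiv's markup at the time) is to fetch the homepage with the same session and look for a logout link:

# Assumption: a logged-in Pixiv homepage contains a logout link; if it does
# not, the login (or the post_key extraction) probably failed.
resp = se.get('https://www.pixiv.net/')
if 'logout' in resp.text:
    print('login OK')
else:
    print('login probably failed - check pixiv_id / password / post_key')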

Once login works, the rest is mostly plumbing. (One gotcha here: Pixiv serves a display-sized preview separately from the original, so you have to dig out the original image's URL.)
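A related gotcha: for works with several images, the per-work "medium" page has no original-image tag at all, so the code below switches to the "manga" view by rewriting the URL. Roughly like this (the illust_id is a made-up example; the URL shape is inferred from the replace() call in the code):

url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=12345678'
many_url = url.replace('medium&illust', 'manga&illust')
# -> 'https://www.pixiv.net/member_illust.php?mode=manga&illust_id=12345678'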

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
import time
import re
import random

se = requests.session()

class Pixiv():

    def __init__(self):
        self.base_url = 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index'
        self.login_url = 'https://accounts.pixiv.net/login?lang=zh'
        self.target_url = 'https://www.pixiv.net/ranking_area.php?type=detail&no=6'
        self.main_url = 'http://www.pixiv.net'
        self.headers = {
            'Referer': 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3013.3 Safari/537.36'
        }
        self.pixiv_id = '????@qq.com'
        self.password = '??????'

        self.post_key = []
        self.return_to = 'http://www.pixiv.net/'
        self.rank = 0

    def login(self):
        post_key_html = se.get(self.base_url, headers=self.headers).text
        post_key_soup = BeautifulSoup(post_key_html, 'lxml')
        self.post_key = post_key_soup.find('input')['value']
        # the lines above capture the post_key
        data = {
            'pixiv_id': self.pixiv_id,
            'password': self.password,
            'return_to': self.return_to,
            'post_key': self.post_key
        }
        se.post(self.login_url, data=data, headers=self.headers)

    def get_url(self):
        html_rank = requests.get(self.target_url).text
        soup = BeautifulSoup(html_rank, 'lxml')
        work_list = soup.select('div > div.work_wrapper')
        for x in work_list:
            y = str(x)
            href = re.findall(r'href="(.+?)"', y, re.I)
            url = 'https://www.pixiv.net/' + href[0]  # URL of this work's page
            jump_to_html = se.get(url, headers=self.headers).text
            img_soup = BeautifulSoup(jump_to_html, 'lxml')
            img_info = img_soup.find('div', attrs={'class': '_layout-thumbnail ui-modal-trigger'})
            img_original = img_soup.find('img', attrs={'class': 'original-image'})
            self.rank += 1
            if img_original is not None:
                img_ori_url = re.findall(r'src="(.+?)"', str(img_original), re.I)[0]
                self.download_img(img_info, url, img_ori_url)  # download this single image
            else:
                # some pages have no original-image URL (multi-image works),
                # so handle them in the else branch instead of continue
                many_url = url.replace('medium&illust', 'manga&illust')
                print(url)
                print(many_url)
                self.download_many_img(img_info, many_url)

    def download_img(self, img_info, url, img_ori_url):
        title = img_info.find('img')['alt']  # extract the title
        src = img_ori_url  # the original image's URL
        src_headers = self.headers
        src_headers['Referer'] = url  # add a Referer or the server returns 403; found the same way as for login
        try:
            html = requests.get(src, headers=src_headers)
            img = html.content
        except requests.exceptions.RequestException:  # occasionally the request fails; just skip this image
            print('failed to fetch this image')
            return False

        # strip the characters that are not allowed in file names;
        # remember the strip() to drop trailing newlines
        title = (title.replace('?', '_').replace('/', '_').replace('\\', '_').replace('*', '_')
                      .replace('|', '_').replace('>', '_').replace('<', '_').replace(':', '_')
                      .replace('"', '_').strip())
        print('saving the image ranked #{}'.format(self.rank))
        with open('{}.'.format(self.rank) + title + '.jpg', 'ab') as f:  # images must be written in binary mode
            f.write(img)
        print('image saved')
    def download_many_img(self, img_info, many_url):
        src_headers = self.headers
        src_headers['Referer'] = many_url  # add a Referer or the server returns 403
        html = requests.get(many_url, headers=src_headers)
        soup = BeautifulSoup(html.content, 'lxml')
        total = soup.find('span', attrs={'class': 'total'}).get_text()
        title = soup.find('title').get_text()
        # strip the characters that are not allowed in file names;
        # remember the strip() to drop trailing newlines
        title = (title.replace('?', '_').replace('/', '_').replace('\\', '_').replace('*', '_')
                      .replace('|', '_').replace('>', '_').replace('<', '_').replace(':', '_')
                      .replace('"', '_').strip())
        img_many = soup.find_all('img', attrs={'class': 'image ui-scroll-view'})
        for x in range(int(total)):
            img_many_url = re.findall(r'data-src="(.+?)"', str(img_many), re.I)[x]
            print('saving image {} of the work ranked #{}'.format(x + 1, self.rank))
            src_headers = self.headers
            src_headers['Referer'] = img_many_url  # Referer again, or 403
            html = requests.get(img_many_url, headers=src_headers)
            img = html.content
            with open('{}.'.format(self.rank) + title + '{}'.format(x + 1) + '.jpg', 'ab') as f:  # binary mode
                f.write(img)
            print('image saved')

    def work(self):
        self.login()
        self.get_url()

pix = Pixiv()
pix.work()
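One last note: the script imports time and random but never uses them. If Pixiv starts throttling the downloads, a simple politeness delay (my addition, untested) at the end of each loop iteration in get_url would do:

time.sleep(random.uniform(1, 3))  # wait 1-3 seconds between works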
