使用python爬取pixiv.net的圖片？

12-28

像http://pixiv.net這種網頁怎麼爬？發送request，urlopen之後返回400？
# -*- coding:utf8 -*-
# Python 2 script: try to log in to pixiv by POSTing the login form with urllib2.
import urllib
import urllib2

url2 = "https://www.secure.pixiv.net/login.php"
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2508.0 Safari/537.36 OPR/34.0.2026.0 (Edition developer)"

# Request headers sent along with the login form.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # The "#" is part of the key string, so this entry is not disabled: it is
    # passed on as a header whose name is literally "#Accept-Encoding".
    "#Accept-Encoding": "gzip, deflate, lzma",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded length; presumably copied from a browser
    # capture and not computed from the body built below -- confirm.
    "Content-Length": "55",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "www.secure.pixiv.net",
    "Origin": "https://www.secure.pixiv.net",
    "Referer": "https://www.secure.pixiv.net/login.php",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": user_agent,
}

# Login form fields.
# NOTE(review): the "493377211": "userName" pair looks like a swapped
# key/value; the working reply further down submits "pixiv_id": "493377211".
values = {"mode": "login", "return_to": "/", "493377211": "userName", "pass": "qw123456", "skip": "1"}

# Passing a data argument makes urllib2 issue a POST.
postdata = urllib.urlencode(values)
req = urllib2.Request(url2, postdata, header)
res = urllib2.urlopen(req)
xmhtml = res.read()
以上代碼是使用py模擬登陸的過程
包含一個本人測試的新註冊的號，供各位大神測試使用，請不要修改密碼~
還有直接請求一個頁面也是返回400
# Fetch a ranking page directly with urllib2 (Python 2), without logging in.
url2 = "http://www.pixiv.net/ranking_area.php?type=detail&no=6"
user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2508.0 Safari/537.36 OPR/34.0.2026.0 (Edition developer)"

# Request headers sent with the page request.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # The "#" is part of the key string, so this entry is not disabled: it is
    # passed on as a header whose name is literally "#Accept-Encoding".
    "#Accept-Encoding": "gzip, deflate, lzma",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # NOTE(review): no data argument is passed to Request below, so this
    # request carries no body, yet a length of 55 and a form content type are
    # announced -- a likely cause of the 400 response; confirm.
    "Content-Length": "55",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "www.pixiv.net",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": user_agent,
}

req = urllib2.Request(url2, headers=header)
res = urllib2.urlopen(req)
xmhtml = res.read()

你提交的數據寫錯了
以下代碼登陸成功:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: LostInNight
# @Date: 2015-10-28 19:59:24
# @Last Modified by: LostInNight
# @Last Modified time: 2015-10-29 01:11:22


import requests
# Log in to pixiv by POSTing the login form through a requests session, then
# save the response body to disk.
login_url = r"https://www.secure.pixiv.net/login.php"

# Form fields submitted to the login endpoint.
data = {
    "mode": "login",
    "return_to": "/",
    "pixiv_id": "493377211",
    "pass": "qw123456",
    "skip": "1",
}

s = requests.Session()
res = s.post(login_url, data=data)
print(res.status_code)

# NOTE(review): "F:html.html" has no separator after the drive letter --
# confirm the intended output location.
with open(r"F:html.html", "wb") as f:
    # res.content is the response body as bytes, matching the binary mode.
    f.write(res.content)

參考了一下深海魚前輩的代碼,受益匪淺,講真我不知道那個secure是怎麼抓到的=w=
但是這樣子好像僅僅完成了一半登陸吧?
我補充了下讓其能夠正常獲取http://www.pixiv.net/的主頁
初學python寫的有點亂,求別噴

import urllib

import httplib2
import requests


login_url = r"https://www.secure.pixiv.net/login.php"

# Template of the login form fields. login_body() fills in the account and
# password on a copy, so the placeholders here are never overwritten.
v_login_body = {
    "mode": "login",
    "return_to": "/",
    "pixiv_id": "YourIdHere",
    "pass": "YourPwdHere",
    "skip": "1",
}


def login_body(username=None, passwd=None):
    """Build the login form payload for the given account.

    Args:
        username: value for the "pixiv_id" form field.
        passwd: value for the "pass" form field.

    Returns:
        A new dict based on v_login_body with both fields filled in, or 0
        (after printing an error message) when either argument is empty.
    """
    if not username or not passwd:
        print("Error in Login(Username or password cannot be empty)")
        return 0

    # Copy the template: binding it directly would alias the module-level
    # dict and leave these credentials in it for every later call.
    t_login_body = dict(v_login_body)
    t_login_body["pixiv_id"] = username
    t_login_body["pass"] = passwd
    return t_login_body
# Base HTTP request headers. The "Host" and "Referer" entries were left
# disabled by the author:
#     "Host": "www.pixiv.net"
#     "Referer": ""
baseHeader = {
    "Connection": "keep-alive",
    "User-Agent": (
        r"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 "
        r"(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36"
    ),
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}
# Log in through a requests session and print what the server returned.
s = requests.Session()
res = s.post(login_url, data=login_body("493377211", "qw123456"))
print(res.status_code)

# Output file; the page fetched at the end of the script is written to it.
f = open("a.html", "w")

# Show the cookies collected by the session and the session id of the reply.
print(s.cookies)
print(res.cookies["PHPSESSID"])

# Login endpoint on the www host, used for the second login request.
l = r"https://www.pixiv.net/login.php"
def request(url, body=None, header=None, method="GET"):
    """Send one HTTP request through httplib2 and return its result.

    Args:
        url: address to request.
        body: optional form fields; URL-encoded with urllib.urlencode when
            truthy, otherwise passed through unchanged.
        header: optional dict of request headers.
        method: HTTP method name, "GET" by default.

    Returns:
        The (response, content) pair produced by httplib2, or (0, 0) when a
        KeyError is raised while encoding or sending.
    """
    try:
        payload = urllib.urlencode(body) if body else body
        client = httplib2.Http(timeout=20)
        response, text = client.request(url, method=method, headers=header, body=payload)
    except KeyError:
        # NOTE(review): it is not evident which call is expected to raise
        # KeyError here; other errors still propagate to the caller.
        return 0, 0
    return response, text
# Work on a copy so the shared baseHeader dict is not modified.
hd = dict(baseHeader)
# NOTE(review): this sends only the session id value, without a
# "PHPSESSID=" name in front of it -- confirm that is intended.
hd["Cookie"] = res.cookies["PHPSESSID"]
hd["Referer"] = login_url
print(hd)

# Post the login form again, this time to the www host through httplib2.
resp, content = request(l, body=login_body("493377211", "qw123456"), header=hd, method="POST")
cookies = resp["set-cookie"]

# NOTE(review): index 8 depends on the exact layout of the Set-Cookie header
# the server returned when this was written -- fragile; confirm which cookie
# is meant.
hd["Cookie"] = cookies.split()[8]
pixivlink = r"http://www.pixiv.net/"
resp, content = request(pixivlink, header=hd)
print(resp)

# Save the fetched home page, then close the file so it is flushed to disk.
f.write(content)
f.close()

抄了深海魚前輩的代碼,還請海涵

我寫過一個…不過寫的不太好
python爬蟲學習--pixiv爬蟲(1)--p站爬蟲的登錄 http://t.cn/RqNSvVY
python爬蟲學習--pixiv爬蟲(2)--國際排行榜的圖片爬取 http://t.cn/RqNStGe
python爬蟲學習--pixiv爬蟲(3)--關注用戶作品爬取 http://t.cn/RqNSIK2

p站改版後增加了反爬蟲機制，首先在登陸界面會發送一個postkey