python模擬登錄知乎，captcha是手工輸入，為什麼也提示captcha錯了？

12-30

#!/usr/bin/python
import urllib
import urllib2
import cookielib

import re
import time
# Site entry points used by the script.
# NOTE(review): hosturl and posturl hold what looks like a page title rather
# than a URL (probably mangled when the post was extracted); the corrected
# script later in this file uses "http://www.zhihu.com" and
# "http://www.zhihu.com/login/email" -- confirm before running.
hosturl = "知乎 - 與世界分享你的知識、經驗和見解"
posturl = "知乎 - 與世界分享你的知識、經驗和見解"
# Prefix of the captcha image URL; a millisecond timestamp is appended below.
captcha_pre = "http://www.zhihu.com/captcha.gif?r="
# Build a cookie-aware opener and install it as the default for urllib2, so
# later urllib2.urlopen() calls share the cookie jar `cj`.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)

opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Download the host page and take the first regex match as the xsrf value.
h = urllib2.urlopen(hosturl)
html = h.read()
# NOTE(review): this pattern matches only a literal "&"; it was presumably an
# HTML-matching pattern that lost its markup in extraction -- verify.
xsrf_str = r"&"
# Raises IndexError when the pattern does not occur in the page.
xsrf = re.findall(xsrf_str, html)[0]
print xsrf

# Build a captcha URL from the current time in milliseconds, save the image
# locally, and let the user type what it shows.
captchaurl = captcha_pre + str(int(time.time() * 1000))
print captchaurl
# NOTE: this download goes through urllib, not through the urllib2 opener
# installed above, so it is not made with the cookie jar `cj`. The write-up
# below this script identifies that as the first cause of the captcha error.
urllib.urlretrieve(captchaurl, "captcha.jpg")
captcha = raw_input("captcha is: ")
print captcha
# Request headers and form fields for the login POST.
# NOTE(review): the Referer literal looks mangled in the same way as hosturl.
headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
"Referer" : "http:知乎 - 與世界分享你的知識、經驗和見解"}

# "password" and "email" carry placeholder values here.
postData = {"_xsrf" : xsrf,
"password" : "wwww",
"captcha" : captcha,
"email" : "xxx",
"remember_me" : "true",
}
# URL-encode the form, POST it with the headers above, and print the reply.
postData = urllib.urlencode(postData)
print postData

request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request)
text = response.read()
print text
返回
{
"r": 1,
"errcode": 1991829,

"data": {"captcha":"\u9a8c\u8bc1\u7801\u9519\u8bef"},
"msg": "\u9a8c\u8bc1\u7801\u9519\u8bef"
}

問題已經解決：

具體是卡在這裡：

1是為了圖方便，下載圖片用了urllib.urlretrieve，可是cookie是綁定到urllib2上的，所以獲取cookie失敗

2是發現登錄知乎必須對login/email介面提交兩次，第一次提交即使captcha對了也不行，必須兩次，太坑了

3就算是登錄成功了，再去拉主頁，還是提示沒登錄，原來get 主頁也要設置headers，不然還是說沒登錄上

修改如下:

#!/usr/bin/python


import urllib

import urllib2

import cookielib

import re

import time
hosturl = "http://www.zhihu.com"

posturl = "http://www.zhihu.com/login/email"

captcha_pre = "http://www.zhihu.com/captcha.gif?r="
#set cookie

cj = cookielib.CookieJar()

cookie_support = urllib2.HTTPCookieProcessor(cj)

opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)

urllib2.install_opener(opener)
#get xsrf

h = urllib2.urlopen(hosturl)

html = h.read()

xsrf_str = r"&"

xsrf = re.findall(xsrf_str, html)[0]

print xsrf
#get captcha

def get_captcha():

    captchaurl = captcha_pre + str(int(time.time() * 1000))

    print captchaurl

    data = urllib2.urlopen(captchaurl).read()

    f = file("captcha.jpg","wb")

    f.write(data)

    f.close()

    captcha = raw_input("captcha is: ")

    print captcha

    return captcha
#post data

def post_data(captcha,xsrf):

    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",

               "Referer" : "http:www.zhihu.com"}

    postData = {"_xsrf" : xsrf,

                "password" : "yyy",

                "captcha" : captcha,

                "email" : "xxx",

                "remember_me" : "true",

                }
    #request

    postData = urllib.urlencode(postData)

    print postData

    request = urllib2.Request(posturl, postData, headers)

    response = urllib2.urlopen(request)

    text = response.read()

    return text
#post it

captcha=get_captcha()

print captcha

text=post_data(captcha,xsrf)

print text
#post again

captcha=get_captcha()

text=post_data(captcha,xsrf)

print text

headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1", "Referer" : "http:www.zhihu.com"} request = urllib2.Request(url="http://www.zhihu.com", headers=headers) response = urllib2.urlopen(request) print response.read()