python模擬登錄知乎,captcha是手工輸入,為什麼也提示captcha錯了?

#!/usr/bin/python

import urllib

import urllib2

import cookielib

import re

import time

# First attempt at a scripted Zhihu login (Python 2). This is the version the
# question is about: the server answers the login POST with a captcha error.

# NOTE(review): hosturl and posturl hold page-title text rather than URLs.
# The corrected script later in this file uses "http://www.zhihu.com" and
# "http://www.zhihu.com/login/email"; presumably the original links were
# replaced when this post was extracted -- confirm before running.
hosturl = "知乎 - 與世界分享你的知識、經驗和見解"

posturl = "知乎 - 與世界分享你的知識、經驗和見解"

# Prefix of the captcha image URL; a timestamp is appended as the r= value below.
captcha_pre = "http://www.zhihu.com/captcha.gif?r="

#set cookie
# Install a global urllib2 opener backed by the cookie jar cj, so that calls
# made through urllib2.urlopen() share the same cookies.

cj = cookielib.LWPCookieJar()

cookie_support = urllib2.HTTPCookieProcessor(cj)

opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)

urllib2.install_opener(opener)

#get xsrf
# Fetch the page at hosturl and take the first regex match as the _xsrf value
# that is sent with the login form.

h = urllib2.urlopen(hosturl)

html = h.read()

# NOTE(review): r"&" has no capture group, so findall() returns the literal
# "&" matches. Presumably the real _xsrf pattern was lost in extraction -- confirm.
xsrf_str = r"&"

# Raises IndexError when the pattern does not match anything in the page.
xsrf = re.findall(xsrf_str, html)[0]

print xsrf

#get captcha
# Build the captcha URL from the current time in milliseconds, save the image
# to captcha.jpg, then read the captcha text from the user.

captchaurl = captcha_pre + str(int(time.time() * 1000))

print captchaurl

# The author's follow-up notes in this file identify this call as the cause of
# the captcha error: the cookies are bound to the urllib2 opener, but the
# image is downloaded with urllib.urlretrieve, so the cookie is not obtained.
urllib.urlretrieve(captchaurl, "captcha.jpg")

captcha = raw_input("captcha is: ")

print captcha

#post data
# Request headers and form fields for the login POST.

headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",

# NOTE(review): this Referer value also looks garbled (page title instead of a URL) -- confirm.
"Referer" : "http:知乎 - 與世界分享你的知識、經驗和見解"}

postData = {"_xsrf" : xsrf,

"password" : "wwww",

"captcha" : captcha,

"email" : "xxx",

"remember_me" : "true",

}

#request
# URL-encode the form, POST it to posturl and print the raw response body.

postData = urllib.urlencode(postData)

print postData

request = urllib2.Request(posturl, postData, headers)

response = urllib2.urlopen(request)

text = response.read()

print text

返回

{

"r": 1,

"errcode": 1991829,

"data": {"captcha":"\u9a8c\u8bc1\u7801\u9519\u8bef"},

"msg": "\u9a8c\u8bc1\u7801\u9519\u8bef"

}


問題已經解決:

具體是卡在這裡:

1是為了圖方便,下載圖片用了urllib.urlretrieve,可是cookie是綁定到urllib2上的,所以獲取cookie失敗

2是發現登錄知乎必須對login/email介面提交兩次,第一次提交即使captcha對了也不行,必須兩次,太坑了

3就算是登錄成功了,再去拉主頁,還是提示沒登錄,原來get 主頁也要設置headers,不然還是說沒登錄上

修改如下:

#!/usr/bin/python

import urllib
import urllib2
import cookielib
import re
import time

# Endpoints used by the login flow.
hosturl = "http://www.zhihu.com"
posturl = "http://www.zhihu.com/login/email"
# Prefix of the captcha image URL; callers append a timestamp as the r= value.
captcha_pre = "http://www.zhihu.com/captcha.gif?r="

#set cookie
# Install a global opener so that every urllib2.urlopen() call in this script
# shares the same in-memory cookie jar.
cj = cookielib.CookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

#get xsrf
# Fetch the home page and extract the _xsrf token that must accompany the
# login POST.
h = urllib2.urlopen(hosturl)
try:
    html = h.read()
finally:
    h.close()
# The pattern published with this script had been reduced to r"&", which can
# only ever yield the literal "&" and never the token.
# NOTE(review): this pattern assumes the token is embedded as
# name="_xsrf" value="..." in the page markup -- confirm against the live page.
xsrf_str = r'name="_xsrf"\s+value="([^"]*)"'
xsrf_matches = re.findall(xsrf_str, html)
if not xsrf_matches:
    # Fail with a clear message instead of an opaque IndexError.
    raise RuntimeError("could not find the _xsrf token in " + hosturl)
xsrf = xsrf_matches[0]
print(xsrf)

#get captcha
def get_captcha():
    """Download a fresh captcha image and ask the user to type it in.

    The image is fetched with urllib2.urlopen (not urllib.urlretrieve) so the
    request goes through the opener installed with urllib2.install_opener and
    therefore shares its cookies. The image bytes are written to
    "captcha.jpg" in the current directory for the user to look at.

    Returns:
        The captcha text entered by the user on standard input.
    """
    # Millisecond timestamp as the r= value, so each call requests a new URL.
    captchaurl = captcha_pre + str(int(time.time() * 1000))
    print(captchaurl)

    response = urllib2.urlopen(captchaurl)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # "with" guarantees the file is closed even if the write raises.
    with open("captcha.jpg", "wb") as f:
        f.write(data)

    captcha = raw_input("captcha is: ")
    print(captcha)
    return captcha

#post data
def post_data(captcha, xsrf, email="xxx", password="yyy"):
    """Submit the login form and return the raw response body.

    Args:
        captcha: Captcha text typed by the user.
        xsrf: The _xsrf token taken from the home page.
        email: Account e-mail; defaults to the placeholder "xxx".
        password: Account password; defaults to the placeholder "yyy".

    Returns:
        The response body of the POST to posturl, as a string.

    Raises:
        urllib2.URLError: if the request fails (HTTPError for HTTP errors).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
        # The published value "http:www.zhihu.com" lacked the "//" and was
        # not a usable absolute URL.
        "Referer": "http://www.zhihu.com/",
    }
    postData = {
        "_xsrf": xsrf,
        "password": password,
        "captcha": captcha,
        "email": email,
        "remember_me": "true",
    }

    #request
    postData = urllib.urlencode(postData)
    print(postData)
    request = urllib2.Request(posturl, postData, headers)
    response = urllib2.urlopen(request)
    try:
        text = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    return text

#post it
captcha = get_captcha()
print(captcha)
text = post_data(captcha, xsrf)
print(text)

#post again
# Per the author's notes in this file, the login/email endpoint has to be
# submitted twice: the first submission fails even with a correct captcha.
captcha = get_captcha()
text = post_data(captcha, xsrf)
print(text)

# Fetch the home page with explicit headers; per the author's notes, a plain
# GET without them is still answered as "not logged in".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
    # The published value "http:www.zhihu.com" lacked the "//" and was not a
    # usable absolute URL.
    "Referer": "http://www.zhihu.com/",
}
request = urllib2.Request(url="http://www.zhihu.com", headers=headers)
response = urllib2.urlopen(request)
try:
    print(response.read())
finally:
    # Release the connection even if reading fails.
    response.close()


推薦閱讀:

如何解析網頁視頻的原始地址?
anaconda是幹什麼的,是 python的第三方解釋環境嗎?
Chrome的開發者工具怎麼查看表單數據?網路選項卡裡的參數一項在哪裡?
Python輸出豆瓣個人主頁源代碼出現編碼錯誤?

TAG:Python | 爬蟲(計算機網路) |