python模擬登錄知乎,captcha是手工輸入,為什麼也提示captcha錯了?
#!/usr/bin/python
# Original (failing) login attempt from the question, laid out one statement
# per line. URL literals are restored from the corrected script further down.
import urllib
import urllib2
import cookielib
import re
import time

hosturl = "http://www.zhihu.com"
posturl = "http://www.zhihu.com/login/email"
# A millisecond timestamp is appended to this prefix as the "r" query value.
captcha_pre = "http://www.zhihu.com/captcha.gif?r="

#set cookie
# Install a cookie-aware opener globally so later urllib2.urlopen() calls
# share one cookie jar.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

#get xsrf
h = urllib2.urlopen(hosturl)
html = h.read()
# NOTE(review): this pattern looks truncated (the HTML part of the regex
# appears to have been stripped) -- confirm the intended _xsrf expression.
xsrf_str = r"&"
xsrf = re.findall(xsrf_str, html)[0]
print(xsrf)

#get captcha
captchaurl = captcha_pre + str(int(time.time() * 1000))
print(captchaurl)
# NOTE: urllib.urlretrieve does not go through the urllib2 opener installed
# above, so this request does not use the cookie jar. This is the first
# problem the author reports below; it is kept here on purpose.
urllib.urlretrieve(captchaurl, "captcha.jpg")
captcha = raw_input("captcha is: ")
print(captcha)

#post data
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
    "Referer": "http://www.zhihu.com",
}
postData = {
    "_xsrf": xsrf,
    "password": "wwww",
    "captcha": captcha,
    "email": "xxx",
    "remember_me": "true",
}

#request
postData = urllib.urlencode(postData)
print(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request)
text = response.read()
print text

返回:
{ "r": 1, "errcode": 1991829, "data": {"captcha": "\u9a8c\u8bc1\u7801\u9519\u8bef"},
"msg": "\u9a8c\u8bc1\u7801\u9519\u8bef"}
問題已經解決:
具體是卡在這裡:
1. 為了圖方便,下載圖片用了 urllib.urlretrieve,可是 cookie 是綁定到 urllib2 上的,所以獲取 cookie 失敗。
2. 發現登錄知乎必須對 login/email 介面提交兩次,第一次提交即使 captcha 對了也不行,必須兩次,太坑了。
3. 就算是登錄成功了,再去拉主頁,還是提示沒登錄,原來 get 主頁也要設置 headers,不然還是說沒登錄上。

修改如下:
#!/usr/bin/python
import urllib
import urllib2
import cookielib
import re
import time
# Endpoints used by the login flow.
hosturl = "http://www.zhihu.com"
posturl = "http://www.zhihu.com/login/email"
# Captcha image URL prefix; get_captcha() appends a millisecond timestamp.
captcha_pre = "http://www.zhihu.com/captcha.gif?r="
#set cookie
# Install a cookie-aware opener globally so every later urllib2.urlopen()
# call in this script shares the same cookie jar.
cj = cookielib.CookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
#get xsrf
h = urllib2.urlopen(hosturl)
html = h.read()
# NOTE(review): this pattern has no capture group and looks truncated --
# confirm the intended expression for extracting the _xsrf token.
xsrf_str = r"&"
# Takes the first match; raises IndexError when the pattern is not found.
xsrf = re.findall(xsrf_str, html)[0]
print xsrf
#get captcha
def get_captcha():
    """Download a fresh captcha image and ask the user to type it in.

    The image is requested with ``urllib2.urlopen`` (so it goes through
    whatever opener was installed with ``urllib2.install_opener``) and is
    written to ``captcha.jpg`` in the current working directory.

    Returns:
        The text entered at the ``captcha is: `` prompt.
    """
    # The millisecond timestamp makes each captcha URL distinct.
    captchaurl = captcha_pre + str(int(time.time() * 1000))
    print(captchaurl)
    response = urllib2.urlopen(captchaurl)
    try:
        data = response.read()
    finally:
        response.close()
    # "with" closes the file even if the write fails.
    with open("captcha.jpg", "wb") as f:
        f.write(data)
    captcha = raw_input("captcha is: ")
    print(captcha)
    return captcha
#post data
def post_data(captcha, xsrf, email="xxx", password="yyy"):
    """Submit the login form to ``posturl`` and return the response body.

    Args:
        captcha: Captcha text typed in by the user.
        xsrf: Value sent as the ``_xsrf`` form field.
        email: Account e-mail; defaults to the placeholder used originally.
        password: Account password; defaults to the original placeholder.

    Returns:
        The raw response body read from the server.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
        # Was "http:www.zhihu.com", which is not a well-formed URL.
        "Referer": "http://www.zhihu.com",
    }
    form_fields = {
        "_xsrf": xsrf,
        "password": password,
        "captcha": captcha,
        "email": email,
        "remember_me": "true",
    }
    #request
    encoded = urllib.urlencode(form_fields)
    # NOTE: debugging output; this prints the password in clear text.
    print(encoded)
    request = urllib2.Request(posturl, encoded, headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()
#post it
captcha = get_captcha()
print(captcha)
text = post_data(captcha, xsrf)
print(text)

#post again
# Per the author's notes, the login endpoint had to be submitted twice,
# each time with a freshly fetched captcha.
captcha = get_captcha()
text = post_data(captcha, xsrf)
print(text)

# Fetch the home page with explicit headers; per the author's notes the site
# still reported "not logged in" when they were omitted.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1",
    # Was "http:www.zhihu.com", which is not a well-formed URL.
    "Referer": "http://www.zhihu.com",
}
request = urllib2.Request(url=hosturl, headers=headers)
response = urllib2.urlopen(request)
try:
    print(response.read())
finally:
    response.close()
推薦閱讀:
※如何解析網頁視頻的原始地址?
※anaconda是幹什麼的,是 python的第三方解釋環境嗎?
※Chrome的開發者工具怎麼查看表單數據?網路選項卡裡的參數一項在哪裡?
※Python輸出豆瓣個人主頁源代碼出現編碼錯誤?