After a Python crawler simulates logging in to Zhihu, why can't it fetch the homepage content?

__author__ = "JYC103"
# coding:utf-8
import requests
import ConfigParser
from bs4 import BeautifulSoup

# Note: Zhihu's editor replaced the URLs in the original post with the page
# title "知乎 - 與世界分享你的知識、經驗和見解"; they are restored below as the
# zhihu.com addresses the code evidently targets.

def Login_Zhihu(email, password):
    zhihu_login = "http://www.zhihu.com/login"
    f = requests.get("http://www.zhihu.com")
    f.encoding = "utf-8"
    soup = BeautifulSoup(f.text, "html.parser")
    xsrf = soup.find("input", {"name": "_xsrf"})["value"]

    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Referer": "http://www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
    }

    loginfo = {"_xsrf": xsrf,
               "email": email,
               "password": password,
               "rememberme": "y"}

    zhihu_session = requests.session()

    login = zhihu_session.post(zhihu_login,
                               headers=headers,
                               data=loginfo,
                               timeout=10)

    if login.status_code == 200:
        print "Login Success"
    else:
        print login.status_code

def Get_Zhuanlan():
    zhuanlan_url = "http://www.zhihu.com"  # the homepage, per the question text
    get_zhuanlan_result = requests.get(zhuanlan_url)
    get_zhuanlan_result.encoding = "utf-8"
    zhuanlan_soup = BeautifulSoup(get_zhuanlan_result.text, "html.parser")
    print zhuanlan_soup
    # return zhuanlan_soup

if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    email = cf.get("info", "zhihu_email")
    password = cf.get("info", "zhihu_password")

    Login_Zhihu(email=email, password=password)
    Get_Zhuanlan()

After performing the simulated login, fetching the homepage still returns the signup/login page. Did I never actually log in, or what is going on?
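One thing worth checking in the code above, separate from the captcha issue raised in the answers below: Login_Zhihu() logs in on a local zhihu_session, but Get_Zhuanlan() then fetches the page with the bare requests.get, which carries none of that session's cookies. A minimal sketch of the difference (URLs as in the question):

import requests

session = requests.session()
# ... session.post(login_url, data=loginfo, headers=headers) as in Login_Zhihu ...

logged_in = session.get("http://www.zhihu.com")   # reuses the login cookies
anonymous = requests.get("http://www.zhihu.com")  # fresh, cookie-less request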

==============================
fanne/Github · GitHub
Still a fairly procedural style of code ~


Came here following @egrcc.

The other answer has already explained why you can't get the content: it should be that the login requires a captcha.

Below is the login code (including captcha handling) and the cookie-management part from my project, for your reference. I hope it helps (freshness not guaranteed; if it can no longer log in, check the latest code on GitHub):

# Imports implied by the snippet (not shown in the original post):
import json
import os
import time

import requests

_Zhihu_URL = "http://www.zhihu.com"
_Login_URL = _Zhihu_URL + "/login"
_Captcha_URL_Prefix = _Zhihu_URL + "/captcha.gif?r="
_Cookies_File_Name = "cookies.json"

_session = None
_header = {"X-Requested-With": "XMLHttpRequest",
           "Referer": "http://www.zhihu.com",
           "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; "
                         "Trident/7.0; Touch; LCJB; rv:11.0)"
                         " like Gecko",
           "Host": "www.zhihu.com"}


def get_captcha_url():
    """Get the captcha URL.

    :return: captcha URL
    :rtype: str
    """
    return _Captcha_URL_Prefix + str(int(time.time() * 1000))


def _save_captcha(url):
    global _session
    r = _session.get(url)
    with open("code.gif", "wb") as f:
        f.write(r.content)


def login(email="", password="", captcha="", savecookies=True):
    """Log in to Zhihu manually, without using cookies.json.

    :param str email: email address
    :param str password: password
    :param str captcha: captcha text
    :param bool savecookies: whether to save a cookies file
    :return: a two-element tuple; the first element indicates success
        (0 means success), and if unsuccessful the second element
        gives the failure reason
    :rtype: (int, dict)
    """
    global _session
    global _header
    data = {"email": email, "password": password,
            "rememberme": "y", "captcha": captcha}
    r = _session.post(_Login_URL, data=data)
    j = r.json()
    c = int(j["r"])
    m = j["msg"]
    if c == 0 and savecookies is True:
        with open(_Cookies_File_Name, "w") as f:
            json.dump(_session.cookies.get_dict(), f)
    return c, m


def create_cookies():
    """Create the cookies file; follow the prompts.

    :return: None
    :rtype: None
    """
    if os.path.isfile(_Cookies_File_Name) is False:
        email = input("email: ")
        password = input("password: ")
        url = get_captcha_url()
        _save_captcha(url)
        print("please check code.gif for captcha")
        captcha = input("captcha: ")
        code, msg = login(email, password, captcha)

        if code == 0:
            print("cookies file created!")
        else:
            print(msg)
        os.remove("code.gif")
    else:
        print("Please delete [" + _Cookies_File_Name + "] first.")


def _init():
    global _session
    if _session is None:
        _session = requests.session()
        _session.headers.update(_header)
        if os.path.isfile(_Cookies_File_Name):
            with open(_Cookies_File_Name, "r") as f:
                cookies_dict = json.load(f)
                _session.cookies.update(cookies_dict)
        else:
            print("no cookies file, this may make something wrong.")
            print("if you will run create_cookies or login next, "
                  "please ignore me.")
            _session.post(_Login_URL, data={})
    else:
        raise Exception("call init func two times")

_init()
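A minimal usage sketch of the module above (the file name zhihu_auth.py is hypothetical; _init() runs automatically at import):

import zhihu_auth   # hypothetical module name for the code above; _init() runs here

# First run: no cookies.json yet, so create it interactively
# (prompts for email/password and saves code.gif for the captcha).
zhihu_auth.create_cookies()

# Subsequent runs: _init() finds cookies.json, loads it into the session,
# and requests made through the module are already logged in.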

For the content-parsing part, you can refer to @egrcc's zhihu-python project, which uses Python 2:
egrcc/zhihu-python · GitHub

You can also look at my zhihu-py3 project, which I split off from that project; it uses Python 3. Given the differences between Python 2 and 3, I started a separate project rather than forking, but since I read through all of the original code before starting, the ideas are much the same.
7sDream/zhihu-py3 · GitHub

So if you want to parse homepage data and extract answers, answerer information and the like, feel free to fork the code and add features~~

Oh, right: I've recently been meaning to add a feature for exporting column articles. If you're interested, you're welcome to work on it with me ヽ(´▽`)ノ

Here is my current TODO list. We could also add [parse the logged-in user's homepage data]; whichever part interests you, come write some code together:

  • Write documentation T^T √
  • Add export-to-Markdown feature √
  • Add fetching of answer upvoters, user followees, user followers, collection followers, question followers, etc.
  • Add column and article classes
  • Add fetching of answer publish and update times

That's all.


My suggestion is to register a new account, or sort out the captcha issue...

Accounts I registered in 2013 all require a captcha check at login, while ones registered in late 2014 and early 2015 don't. So the asker was probably tripped up by this little detail...

Actually, you could just print out the returned response and take a look...

Update:
Zhihu seems to have removed the captcha; none of my recent logins needed one.
2015/4/23


I can't tell whether the URLs in your code above are right, since Zhihu rendered them as text.
If the URLs are all correct, the likely cause is that you hit a captcha.
Fetching the captcha:
def getCaptcha():
    # assumes time is imported and session / headers exist as in the code above
    r = int(time.time() * 1000)   # e.g. r = 1471341285051
    url = "http://www.zhihu.com/captcha.gif?r=" + str(r) + "&type=login"

    image = session.get(url, headers=headers)
    f = open("photo.jpg", "wb")
    f.write(image.content)
    f.close()

Actually, you could print out the content of the failed login page.
print is a decent debug tool.
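A minimal sketch of that debugging idea (session and headers are assumed to exist as in the snippets above, and login_data is a hypothetical name for the form fields being posted):

# Print exactly what the server returned instead of assuming the login worked.
resp = session.post("http://www.zhihu.com/login", data=login_data, headers=headers)
print(resp.status_code)
print(resp.content)   # on failure Zhihu returns JSON such as {"r": 1, "msg": "..."}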

Here is some Zhihu login code I wrote myself:
http://30daydo.com/article/7


You can do this with Selenium.
See: Using Selenium to log in to Zhihu and Bilibili
Once you're logged in, just get the page_source.
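A minimal sketch of that approach (assumes the selenium package and a matching chromedriver are installed; the sign-in URL and the manual-login pause are illustrative, not a fully automated login):

from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.zhihu.com/signin")  # open the login page
time.sleep(60)                              # log in by hand (captcha and all)
driver.get("https://www.zhihu.com")         # the browser session is now logged in
print(driver.page_source[:500])             # logged-in homepage HTML
driver.quit()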


I suggest using a regex to find the captcha, downloading it with urllib.urlretrieve, and entering it by hand; grab the captcha id with a regex as well, add both to the POST data, and submit again. This is my Douban login code (I couldn't be bothered to fiddle with Zhihu), but it should have some reference value.

import urllib
import urllib2
import cookielib
import re

class DB(object):
    def __init__(self, email, passwd):
        self.url = "http://www.douban.com/accounts/login"
        self.post = {
            "form_email": email,
            "form_password": passwd,
            "source": "index_nav"
        }
        cookie = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        self.response = self.opener.open(self.url, urllib.urlencode(self.post))

    def login(self):
        if self.response.geturl() == self.url:
            print "logining..."
            html = self.response.read()
            # The two regexes below were mangled in the original post; these
            # reconstructions (matching the captcha image URL and the hidden
            # captcha-id field of Douban's old login page) are assumptions.
            reg = r'<img id="captcha_image" src="(.+?)"'
            imglist = re.findall(reg, html)
            urllib.urlretrieve(imglist[0], "captcha.jpg")
            captcha = raw_input("captcha is: ")
            regid = r'name="captcha-id" value="(.+?)"'
            ids = re.findall(regid, html)
            self.post["captcha-solution"] = captcha
            self.post["captcha-id"] = ids[0]
            self.post["user_login"] = "登錄"
            self.post["redir"] = "http://www.douban.com/doumail/"
            self.response = self.opener.open(self.url, urllib.urlencode(self.post))
            if self.response.geturl() == "http://www.douban.com/doumail/":
                print "login success !"

email = raw_input("Your email: ")
passwd = raw_input("Your passwd: ")
my = DB(email, passwd)
my.login()


url = "http://www.zhihu.com/login"
I get the response below; please help, thanks!
{
    "r": 1,
    "errcode": 1991829,
    "data": {"captcha": "\u9a8c\u8bc1\u7801\u9519\u8bef"},
    "msg": "\u9a8c\u8bc1\u7801\u9519\u8bef"
}
(The escaped string \u9a8c\u8bc1\u7801\u9519\u8bef decodes to 驗證碼錯誤, i.e. "captcha error".)


I have this problem too... but when I log in through the web page, it never asks me for a captcha.


url="http://www.zhihu.com/captcha.gif?r="+str(r)+"type=login" 這個網址你們怎麼發現的????


#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import hashlib
import urllib
import urllib2
import cookielib
import xml.etree.ElementTree as Etree
from datetime import datetime

import requests

loginheaders = {
    # note: a Host header normally carries only the host name,
    # e.g. "cpaexam.cicpa.org.cn", not a full URL
    "Host": "http://cpaexam.cicpa.org.cn/",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
    "Cookie": "cookiee=20111136; ___rl__test__cookies=1480664016733; OUTFOX_SEARCH_USER_ID_NCOO=1216748558.1867485; JSESSIONID=vgdpYB1fdfRCVZ5dhyQ3nn1YdzBJ6nqCQSvJVr61Qrk7vw8vsv3v!1505567094",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": 1,
    "Cache-Control": "max-age=0",
}

post_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Content-Length": 75,
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": "OUTFOX_SEARCH_USER_ID_NCOO=1216748558.1867485; cookiee=20111135; JSESSIONID=MkyDYCrR3LXVMZKqvLdj0dp2ZhTlxRcnG2LVvLKf0Dtmx1h2KHtn!508040916",
    "Host": "cpaexam.cicpa.org.cn",
    "Origin": "http://cpaexam.cicpa.org.cn",
    "Referer": "http://cpaexam.cicpa.org.cn/scoreshow",
    "Upgrade-Insecure-Requests": 1,
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"
}

hosturl = "http://cpaexam.cicpa.org.cn/"
imgurl = "http://cpaexam.cicpa.org.cn/Fri%20Dec%2002%202016%2016:09:21%20GMT+0800%20(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4).check"
# imgurl = "http://oa.gaodun.com/weaver/weaver.file.MakeValidateCode?seriesnum_=1"


class APIClient(object):
    """Client for the ysdm.net captcha-solving service."""

    def __init__(self):
        self.paramDict = {}
        self.paramDict["username"] = "zhaochenglan"
        self.paramDict["password"] = "123456"
        self.paramDict["typeid"] = 1040
        self.paramDict["timeout"] = 60
        self.paramDict["softid"] = 1
        self.paramDict["softkey"] = "b40ffbee5c1cf4e38028c197eb2fc751"
        self.paramKeys = ["username",
                          "password",
                          "typeid",
                          "timeout",
                          "softid",
                          "softkey"]

    def http_request(self, url, paramDict):
        # build an urlencoded body by hand; the trailing "&" is stripped below
        post_content = ""
        for key in paramDict:
            post_content = post_content + "%s=%s&" % (key, paramDict[key])
        post_content = post_content[0:-1]
        req = urllib2.Request(url, data=post_content)
        req.add_header("Content-Type", "application/x-www-form-urlencoded")
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(req, post_content)
        return response.read()

    def http_upload_image(self, url, filebytes):
        # assemble a multipart/form-data body by hand
        timestr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        boundary = "------------" + hashlib.md5(timestr).hexdigest().lower()
        boundarystr = "\r\n--%s\r\n" % (boundary,)

        bs = b""
        for key in self.paramKeys:
            bs = bs + boundarystr.encode("ascii")
            param = "Content-Disposition: form-data; name=\"%s\"\r\n\r\n%s" \
                    % (key, self.paramDict[key])
            bs = bs + param.encode("utf8")
        bs = bs + boundarystr.encode("ascii")

        header = "Content-Disposition: form-data; name=\"image\"; " \
                 "filename=\"%s\"\r\nContent-Type: image/gif\r\n\r\n" % ("sample",)
        bs = bs + header.encode("utf8")

        bs = bs + filebytes
        tailer = "\r\n--%s--\r\n" % (boundary,)
        bs = bs + tailer.encode("ascii")

        headers = {"Content-Type": "multipart/form-data; boundary=%s" % boundary,
                   "Connection": "Keep-Alive",
                   "Expect": "100-continue"}
        response = requests.post(url, params="", data=bs, headers=headers)
        print response.text
        notify_data_tree = Etree.fromstring(response.text)
        result = notify_data_tree.find("Result").text
        return result


def arguments_to_dict(args):
    argDict = {}
    if args is None:
        return argDict

    count = len(args)
    if count <= 1:
        print "exit:need arguments."
        return argDict
    for i in range(1, count):  # every "key=value" argument after the script name
        pair = args[i].split("=")
        if len(pair) < 2:
            continue
        else:
            argDict[pair[0]] = pair[1]
    return argDict


def chaxun(name, id_card, chaxun_url, imgurl, img_name):
    cookiejar = cookielib.LWPCookieJar()
    cookieSupport = urllib2.HTTPCookieProcessor(cookiejar)
    opener = urllib2.build_opener(cookieSupport, urllib2.HTTPHandler)
    code = get_code(imgurl, img_name, opener)
    # print "captcha is:", code
    ChaxunData = urllib.urlencode({
        "annual": 2016,
        "name": name.encode("gbk"),
        "idCard": id_card,
        "validate": code
    })
    print ChaxunData
    c = urllib2.Request(chaxun_url, ChaxunData, headers=loginheaders)
    print opener.open(c).read().decode("gbk")


def get_code(imgurl, img_name, opener):
    # download the captcha image with the cookie-carrying opener ...
    r = urllib2.Request(imgurl, headers=loginheaders)
    f = open(img_name, "wb")
    f.write(opener.open(r).read())
    f.close()
    # ... then send it to the captcha-solving service
    filebytes = open(img_name, "rb").read()
    client = APIClient()
    result = client.http_upload_image("http://api.ysdm.net/create.xml", filebytes)
    return result


def main(name, id_card, chaxun_url, imgurl, img_name):
    chaxun(name, id_card, chaxun_url, imgurl, img_name)


if __name__ == "__main__":
    main(u"酈文崢", "110101198408050532", imgurl,
         img_name="22.jpg", chaxun_url="http://cpaexam.cicpa.org.cn/scoreshow")

This is a script I wrote to crawl CPA exam results, but the POST gets no data back. Any help would be much appreciated.


Did you solve it in the end? I'm running into this problem now too; could you give me a pointer?

