After a Python crawler simulates logging in to Zhihu, why can't it fetch the homepage content?

__author__ = "JYC103"
# coding:utf-8
import requests
import ConfigParser
from bs4 import BeautifulSoup

# Note: Zhihu's editor replaced the URLs in the original post with the page
# title "知乎 - 與世界分享你的知識、經驗和見解"; they are restored below as the
# zhihu.com addresses the code evidently targets.

def Login_Zhihu(email, password):
    zhihu_login = "http://www.zhihu.com/login"
    f = requests.get("http://www.zhihu.com")
    f.encoding = "utf-8"
    soup = BeautifulSoup(f.text, "html.parser")
    xsrf = soup.find("input", {"name": "_xsrf"})["value"]

    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        "Host": "www.zhihu.com",
        "Referer": "http://www.zhihu.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
    }

    loginfo = {"_xsrf": xsrf,
               "email": email,
               "password": password,
               "rememberme": "y"}

    zhihu_session = requests.session()

    login = zhihu_session.post(zhihu_login,
                               headers=headers,
                               data=loginfo,
                               timeout=10)

    if login.status_code == 200:
        print "Login Success"
    else:
        print login.status_code

def Get_Zhuanlan():
    zhuanlan_url = "http://www.zhihu.com"  # the homepage, per the question text
    get_zhuanlan_result = requests.get(zhuanlan_url)
    get_zhuanlan_result.encoding = "utf-8"
    zhuanlan_soup = BeautifulSoup(get_zhuanlan_result.text, "html.parser")
    print zhuanlan_soup
    # return zhuanlan_soup

if __name__ == "__main__":
    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    email = cf.get("info", "zhihu_email")
    password = cf.get("info", "zhihu_password")

    Login_Zhihu(email=email, password=password)
    Get_Zhuanlan()

After performing the simulated login, fetching the homepage still returns the signup/login page. Did I never actually log in, or what is going on?
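One thing worth checking in the code above, separate from the captcha issue raised in the answers below: Login_Zhihu() logs in on a local zhihu_session, but Get_Zhuanlan() then fetches the page with the bare requests.get, which carries none of that session's cookies. A minimal sketch of the difference (URLs as in the question):

import requests

session = requests.session()
# ... session.post(login_url, data=loginfo, headers=headers) as in Login_Zhihu ...

logged_in = session.get("http://www.zhihu.com")   # reuses the login cookies
anonymous = requests.get("http://www.zhihu.com")  # fresh, cookie-less request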

==============================
fanne/Github · GitHub
Still a fairly procedural style of code ~


Came here following @egrcc.

The other answer has already explained why you can't get the content: it should be that the login requires a captcha.

Below is the login code (including captcha handling) and the cookie-management part from my project, for your reference. I hope it helps (freshness not guaranteed; if it can no longer log in, check the latest code on GitHub):

# Imports implied by the snippet (not shown in the original post):
import json
import os
import time

import requests

_Zhihu_URL = "http://www.zhihu.com"
_Login_URL = _Zhihu_URL + "/login"
_Captcha_URL_Prefix = _Zhihu_URL + "/captcha.gif?r="
_Cookies_File_Name = "cookies.json"

_session = None
_header = {"X-Requested-With": "XMLHttpRequest",
           "Referer": "http://www.zhihu.com",
           "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; "
                         "Trident/7.0; Touch; LCJB; rv:11.0)"
                         " like Gecko",
           "Host": "www.zhihu.com"}


def get_captcha_url():
    """Get the captcha URL.

    :return: captcha URL
    :rtype: str
    """
    return _Captcha_URL_Prefix + str(int(time.time() * 1000))


def _save_captcha(url):
    global _session
    r = _session.get(url)
    with open("code.gif", "wb") as f:
        f.write(r.content)


def login(email="", password="", captcha="", savecookies=True):
    """Log in to Zhihu manually, without using cookies.json.

    :param str email: email address
    :param str password: password
    :param str captcha: captcha text
    :param bool savecookies: whether to save a cookies file
    :return: a two-element tuple; the first element indicates success
        (0 means success), and if unsuccessful the second element
        gives the failure reason
    :rtype: (int, dict)
    """
    global _session
    global _header
    data = {"email": email, "password": password,
            "rememberme": "y", "captcha": captcha}
    r = _session.post(_Login_URL, data=data)
    j = r.json()
    c = int(j["r"])
    m = j["msg"]
    if c == 0 and savecookies is True:
        with open(_Cookies_File_Name, "w") as f:
            json.dump(_session.cookies.get_dict(), f)
    return c, m


def create_cookies():
    """Create the cookies file; follow the prompts.

    :return: None
    :rtype: None
    """
    if os.path.isfile(_Cookies_File_Name) is False:
        email = input("email: ")
        password = input("password: ")
        url = get_captcha_url()
        _save_captcha(url)
        print("please check code.gif for captcha")
        captcha = input("captcha: ")
        code, msg = login(email, password, captcha)

        if code == 0:
            print("cookies file created!")
        else:
            print(msg)
        os.remove("code.gif")
    else:
        print("Please delete [" + _Cookies_File_Name + "] first.")


def _init():
    global _session
    if _session is None:
        _session = requests.session()
        _session.headers.update(_header)
        if os.path.isfile(_Cookies_File_Name):
            with open(_Cookies_File_Name, "r") as f:
                cookies_dict = json.load(f)
                _session.cookies.update(cookies_dict)
        else:
            print("no cookies file, this may make something wrong.")
            print("if you will run create_cookies or login next, "
                  "please ignore me.")
            _session.post(_Login_URL, data={})
    else:
        raise Exception("call init func two times")

_init()
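A minimal usage sketch of the module above (the file name zhihu_auth.py is hypothetical; _init() runs automatically at import):

import zhihu_auth   # hypothetical module name for the code above; _init() runs here

# First run: no cookies.json yet, so create it interactively
# (prompts for email/password and saves code.gif for the captcha).
zhihu_auth.create_cookies()

# Subsequent runs: _init() finds cookies.json, loads it into the session,
# and requests made through the module are already logged in.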

For the content-parsing part, you can refer to @egrcc's zhihu-python project, which uses Python 2:
egrcc/zhihu-python · GitHub

You can also look at my zhihu-py3 project, which I split off from that project; it uses Python 3. Given the differences between Python 2 and 3, I started a separate project rather than forking, but since I read through all of the original code before starting, the ideas are much the same.
7sDream/zhihu-py3 · GitHub

So if you want to parse homepage data and extract answers, answerer information and the like, feel free to fork the code and add features~~

Oh, right: I've recently been meaning to add a feature for exporting column articles. If you're interested, you're welcome to work on it with me ヽ(´▽`)ノ

Here is my current TODO list. We could also add [parse the logged-in user's homepage data]; whichever part interests you, come write some code together:

  • Write documentation T^T √
  • Add export-to-Markdown feature √
  • Add fetching of answer upvoters, user followees, user followers, collection followers, question followers, etc.
  • Add column and article classes
  • Add fetching of answer publish and update times

That's all.


My suggestion is to register a new account, or sort out the captcha issue...

Accounts I registered in 2013 all require a captcha check at login, while ones registered in late 2014 and early 2015 don't. So the asker was probably tripped up by this little detail...

Actually, you could just print out the returned response and take a look...

Update:
Zhihu seems to have removed the captcha; none of my recent logins needed one.
2015/4/23


I can't tell whether the URLs in your code above are right, since Zhihu rendered them as text.
If the URLs are all correct, the likely cause is that you hit a captcha.
Fetching the captcha:
def getCaptcha():
    # assumes time is imported and session / headers exist as in the code above
    r = int(time.time() * 1000)   # e.g. r = 1471341285051
    url = "http://www.zhihu.com/captcha.gif?r=" + str(r) + "&type=login"

    image = session.get(url, headers=headers)
    f = open("photo.jpg", "wb")
    f.write(image.content)
    f.close()

Actually, you could print out the content of the failed login page.
print is a decent debug tool.
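A minimal sketch of that debugging idea (session and headers are assumed to exist as in the snippets above, and login_data is a hypothetical name for the form fields being posted):

# Print exactly what the server returned instead of assuming the login worked.
resp = session.post("http://www.zhihu.com/login", data=login_data, headers=headers)
print(resp.status_code)
print(resp.content)   # on failure Zhihu returns JSON such as {"r": 1, "msg": "..."}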

Here is some Zhihu login code I wrote myself:
http://30daydo.com/article/7


You can do this with Selenium.
See: Using Selenium to log in to Zhihu and Bilibili
Once you're logged in, just get the page_source.
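A minimal sketch of that approach (assumes the selenium package and a matching chromedriver are installed; the sign-in URL and the manual-login pause are illustrative, not a fully automated login):

from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.zhihu.com/signin")  # open the login page
time.sleep(60)                              # log in by hand (captcha and all)
driver.get("https://www.zhihu.com")         # the browser session is now logged in
print(driver.page_source[:500])             # logged-in homepage HTML
driver.quit()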


I suggest using a regex to find the captcha, downloading it with urllib.urlretrieve, and entering it by hand; grab the captcha id with a regex as well, add both to the POST data, and submit again. This is my Douban login code (I couldn't be bothered to fiddle with Zhihu), but it should have some reference value.

import urllib
import urllib2
import cookielib
import re

class DB(object):
    def __init__(self, email, passwd):
        self.url = "http://www.douban.com/accounts/login"
        self.post = {
            "form_email": email,
            "form_password": passwd,
            "source": "index_nav"
        }
        cookie = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        self.response = self.opener.open(self.url, urllib.urlencode(self.post))

    def login(self):
        if self.response.geturl() == self.url:
            print "logining..."
            html = self.response.read()
            # The two regexes below were mangled in the original post; these
            # reconstructions (matching the captcha image URL and the hidden
            # captcha-id field of Douban's old login page) are assumptions.
            reg = r'<img id="captcha_image" src="(.+?)"'
            imglist = re.findall(reg, html)
            urllib.urlretrieve(imglist[0], "captcha.jpg")
            captcha = raw_input("captcha is: ")
            regid = r'name="captcha-id" value="(.+?)"'
            ids = re.findall(regid, html)
            self.post["captcha-solution"] = captcha
            self.post["captcha-id"] = ids[0]
            self.post["user_login"] = "登錄"
            self.post["redir"] = "http://www.douban.com/doumail/"
            self.response = self.opener.open(self.url, urllib.urlencode(self.post))
            if self.response.geturl() == "http://www.douban.com/doumail/":
                print "login success !"

email = raw_input("Your email: ")
passwd = raw_input("Your passwd: ")
my = DB(email, passwd)
my.login()


url = "http://www.zhihu.com/login"
I get the response below; please help, thanks!
{
    "r": 1,
    "errcode": 1991829,
    "data": {"captcha": "\u9a8c\u8bc1\u7801\u9519\u8bef"},
    "msg": "\u9a8c\u8bc1\u7801\u9519\u8bef"
}
(The escaped string \u9a8c\u8bc1\u7801\u9519\u8bef decodes to 驗證碼錯誤, i.e. "captcha error".)


I have this problem too... but when I log in through the web page, it never asks me for a captcha.


url="http://www.zhihu.com/captcha.gif?r="+str(r)+"type=login" 這個網址你們怎麼發現的????


#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf8")
import hashlib
import urllib
import urllib2
import cookielib
import xml.etree.ElementTree as Etree
from datetime import datetime

import requests

loginheaders = {
    # note: a Host header normally carries only the host name,
    # e.g. "cpaexam.cicpa.org.cn", not a full URL
    "Host": "http://cpaexam.cicpa.org.cn/",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
    "Cookie": "cookiee=20111136; ___rl__test__cookies=1480664016733; OUTFOX_SEARCH_USER_ID_NCOO=1216748558.1867485; JSESSIONID=vgdpYB1fdfRCVZ5dhyQ3nn1YdzBJ6nqCQSvJVr61Qrk7vw8vsv3v!1505567094",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": 1,
    "Cache-Control": "max-age=0",
}

post_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Content-Length": 75,
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": "OUTFOX_SEARCH_USER_ID_NCOO=1216748558.1867485; cookiee=20111135; JSESSIONID=MkyDYCrR3LXVMZKqvLdj0dp2ZhTlxRcnG2LVvLKf0Dtmx1h2KHtn!508040916",
    "Host": "cpaexam.cicpa.org.cn",
    "Origin": "http://cpaexam.cicpa.org.cn",
    "Referer": "http://cpaexam.cicpa.org.cn/scoreshow",
    "Upgrade-Insecure-Requests": 1,
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"
}

hosturl = "http://cpaexam.cicpa.org.cn/"
imgurl = "http://cpaexam.cicpa.org.cn/Fri%20Dec%2002%202016%2016:09:21%20GMT+0800%20(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4).check"
# imgurl = "http://oa.gaodun.com/weaver/weaver.file.MakeValidateCode?seriesnum_=1"


class APIClient(object):
    """Client for the ysdm.net captcha-solving service."""

    def __init__(self):
        self.paramDict = {}
        self.paramDict["username"] = "zhaochenglan"
        self.paramDict["password"] = "123456"
        self.paramDict["typeid"] = 1040
        self.paramDict["timeout"] = 60
        self.paramDict["softid"] = 1
        self.paramDict["softkey"] = "b40ffbee5c1cf4e38028c197eb2fc751"
        self.paramKeys = ["username",
                          "password",
                          "typeid",
                          "timeout",
                          "softid",
                          "softkey"]

    def http_request(self, url, paramDict):
        # build an urlencoded body by hand; the trailing "&" is stripped below
        post_content = ""
        for key in paramDict:
            post_content = post_content + "%s=%s&" % (key, paramDict[key])
        post_content = post_content[0:-1]
        req = urllib2.Request(url, data=post_content)
        req.add_header("Content-Type", "application/x-www-form-urlencoded")
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(req, post_content)
        return response.read()

    def http_upload_image(self, url, filebytes):
        # assemble a multipart/form-data body by hand
        timestr = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        boundary = "------------" + hashlib.md5(timestr).hexdigest().lower()
        boundarystr = "\r\n--%s\r\n" % (boundary,)

        bs = b""
        for key in self.paramKeys:
            bs = bs + boundarystr.encode("ascii")
            param = "Content-Disposition: form-data; name=\"%s\"\r\n\r\n%s" \
                    % (key, self.paramDict[key])
            bs = bs + param.encode("utf8")
        bs = bs + boundarystr.encode("ascii")

        header = "Content-Disposition: form-data; name=\"image\"; " \
                 "filename=\"%s\"\r\nContent-Type: image/gif\r\n\r\n" % ("sample",)
        bs = bs + header.encode("utf8")

        bs = bs + filebytes
        tailer = "\r\n--%s--\r\n" % (boundary,)
        bs = bs + tailer.encode("ascii")

        headers = {"Content-Type": "multipart/form-data; boundary=%s" % boundary,
                   "Connection": "Keep-Alive",
                   "Expect": "100-continue"}
        response = requests.post(url, params="", data=bs, headers=headers)
        print response.text
        notify_data_tree = Etree.fromstring(response.text)
        result = notify_data_tree.find("Result").text
        return result


def arguments_to_dict(args):
    argDict = {}
    if args is None:
        return argDict

    count = len(args)
    if count <= 1:
        print "exit:need arguments."
        return argDict
    for i in range(1, count):  # every "key=value" argument after the script name
        pair = args[i].split("=")
        if len(pair) < 2:
            continue
        else:
            argDict[pair[0]] = pair[1]
    return argDict


def chaxun(name, id_card, chaxun_url, imgurl, img_name):
    cookiejar = cookielib.LWPCookieJar()
    cookieSupport = urllib2.HTTPCookieProcessor(cookiejar)
    opener = urllib2.build_opener(cookieSupport, urllib2.HTTPHandler)
    code = get_code(imgurl, img_name, opener)
    # print "captcha is:", code
    ChaxunData = urllib.urlencode({
        "annual": 2016,
        "name": name.encode("gbk"),
        "idCard": id_card,
        "validate": code
    })
    print ChaxunData
    c = urllib2.Request(chaxun_url, ChaxunData, headers=loginheaders)
    print opener.open(c).read().decode("gbk")


def get_code(imgurl, img_name, opener):
    # download the captcha image with the cookie-carrying opener ...
    r = urllib2.Request(imgurl, headers=loginheaders)
    f = open(img_name, "wb")
    f.write(opener.open(r).read())
    f.close()
    # ... then send it to the captcha-solving service
    filebytes = open(img_name, "rb").read()
    client = APIClient()
    result = client.http_upload_image("http://api.ysdm.net/create.xml", filebytes)
    return result


def main(name, id_card, chaxun_url, imgurl, img_name):
    chaxun(name, id_card, chaxun_url, imgurl, img_name)


if __name__ == "__main__":
    main(u"酈文崢", "110101198408050532", imgurl,
         img_name="22.jpg", chaxun_url="http://cpaexam.cicpa.org.cn/scoreshow")

This is a script I wrote to crawl CPA exam results, but the POST gets no data back. Any help would be much appreciated.


Did you solve it in the end? I'm running into this problem now too; could you give me a pointer?

