I'm scraping jokes from Qiushibaike. The code runs without errors but returns no results, and I have some doubts about what I wrote. Could someone more experienced point me in the right direction?
The code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = "https://www.qiushibaike.com/text/"

def download_page(url):
    return requests.get(url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"
    }).content

def parse_html(html):
    soup = BeautifulSoup(html, "html.parser")
    qiushi_list_soup = soup.find("div", attrs={"id": "content-left", "class": "col1"})
    qiushi_content_list = []
    for qiushi_li in qiushi_list_soup.find_all("div", attrs={"id": "qiushi_tag_", "class": "article block untagged mb15 typs_hot"}):
        detail = qiushi_li.find("div", attrs={"class": "content"})
        qiushi_content = detail.find("span").getText()
        qiushi_content_list.append(qiushi_content)
    next_page = soup.find("a", attrs={"href": "/text/page/2/", "rel": "nofollow"}).find("span", attrs={"next"})
    if next_page:
        return qiushi_content_list, DOWNLOAD_URL + next_page["href"]
    return qiushi_content_list, None

def main():
    url = DOWNLOAD_URL
    with codecs.open("qiushis", "wb", encoding="utf-8") as fp:
        while url:
            html = download_page(url)
            qiushis, url = parse_html(html)
            fp.write(u"{qiushis}\n".format(qiushis="\n".join(qiushis)))

if __name__ == "__main__":
    main()
To keep things clear, I attached screenshots (not reproduced here) of: the run output; the page being scraped (the jokes are all plain text); the list element that holds the jokes; the tag that holds the content; and the page-turn link.
My own analysis:
The request isn't returning any results, so this part of the code may be the problem (the id qiushi_tag_ is followed by a run of digits, so I only wrote the part the ids share):
qiushi_list_soup.find_all("div", attrs={"id": "qiushi_tag_", "class": "article block untagged mb15 typs_hot"})
And the page-turn part:
next_page = soup.find("a", attrs={"href": "/text/page/2/", "rel": "nofollow"}).find("span", attrs={"next"})
if next_page:
    return qiushi_content_list, DOWNLOAD_URL + next_page["href"]
I keep feeling something is wrong somewhere, but my skills are limited and I can't find it. I'm a complete beginner, so please bear with me if anything above is unclear!
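One way around the numeric suffix problem: BeautifulSoup's find_all accepts a compiled regular expression as an attribute filter, so ids like qiushi_tag_119991855 can be matched by their common prefix instead of an exact string (which never matches). A minimal sketch, using made-up HTML in the same shape as the page:

```python
import re
from bs4 import BeautifulSoup

# Hypothetical HTML mimicking the page structure; ids vary only in the digits.
html = '''
<div id="content-left" class="col1">
  <div id="qiushi_tag_119991855" class="article block untagged mb15">
    <div class="content"><span>joke one</span></div>
  </div>
  <div id="qiushi_tag_119991901" class="article block untagged mb15 typs_hot">
    <div class="content"><span>joke two</span></div>
  </div>
</div>
'''
soup = BeautifulSoup(html, "html.parser")
# A regex matches every id with the common prefix, regardless of the digits.
items = soup.find_all("div", id=re.compile(r"^qiushi_tag_\d+$"))
jokes = [d.find("div", class_="content").find("span").get_text() for d in items]
print(jokes)  # ['joke one', 'joke two']
```

Note this also sidesteps the class filter: the class list differs between items (typs_hot is not always present), which is another reason an exact attrs match can miss entries.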
I ran your program and it didn't fetch anything. First, the jokes are never extracted; second, even if the next-page url were extracted, it would be "/text/page/2", and joining it onto the base without any processing gives
https://www.qiushibaike.com/text//text/page/2
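The doubled-path problem can be avoided with the standard library's urljoin, which resolves a root-relative href against the base URL instead of blindly concatenating. A small sketch:

```python
from urllib.parse import urljoin

base = "https://www.qiushibaike.com/text/"
next_href = "/text/page/2/"

# Naive concatenation duplicates the path segment:
print(base + next_href)          # https://www.qiushibaike.com/text//text/page/2/

# urljoin replaces the base path with the root-relative href:
print(urljoin(base, next_href))  # https://www.qiushibaike.com/text/page/2/
```

The same call also handles relative hrefs without a leading slash, so it works regardless of how the site writes its pager links.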
Here is my own code:
# Python 3.5
import re
import codecs
import requests

DOWNLOAD_URL = "https://www.qiushibaike.com/text/"

def download_page(url):
    return requests.get(url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"}).text

def parse_html(html):
    nextpage = ""
    content_list = []
    # capture the href of the "next page" pager link
    nextpage_reg = r'<a href="(/text/page/\d+/?)"[^>]*?>\s*<span class="next">\s*下一頁'
    # capture the joke text inside the content span
    content_reg = r'<div class="content">\s*?<span>([\w\W]*?)</span>'
    content_list = re.findall(content_reg, html)
    if re.findall(nextpage_reg, html):
        nextpage = re.findall(nextpage_reg, html)[0]
        nextpage = "{}{}".format(DOWNLOAD_URL, nextpage.replace("/text/", ""))
    return content_list, nextpage

def main():
    url = DOWNLOAD_URL
    with codecs.open("qiushis", "wb", encoding="utf-8") as fp:
        while url:
            html = download_page(url)
            qiushis, url = parse_html(html)
            for item in qiushis:
                fp.write(item.strip().replace("<br/>", "\n") + "\n")

if __name__ == "__main__":
    main()
import requests
from lxml import html

URL = "https://www.qiushibaike.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
}
r = requests.get(URL + "/text/", headers=headers)
tree = html.fromstring(r.text)
# note: the class name really is spelled "contentHerf" on the site
text_eles = tree.xpath('//a[@class="contentHerf"]')
urls_end = [text_ele.attrib["href"] for text_ele in text_eles]
text_urls = [requests.get(URL + text_url, headers=headers) for text_url in urls_end]
trees_next = [html.fromstring(text_url.text) for text_url in text_urls]
texts = [tree_next.xpath('//div[@class="content"]/text()') for tree_next in trees_next]
text = ["".join(textstr) for textstr in texts]

file = "duanzi.txt"
with open(file, "w+", encoding="utf-8") as f:
    for textstr in text:
        if textstr:  # skip empty entries; the with block closes the file afterwards
            f.write(textstr)
XPath for the win.
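One pitfall when writing XPath strings in Python is quote nesting: put single quotes inside a double-quoted expression (or vice versa), otherwise the string literal breaks. This can be sketched with the standard library's xml.etree, which supports a small XPath subset including attribute predicates (element names and attribute values here are made up for illustration):

```python
import xml.etree.ElementTree as ET

# Hypothetical fragment mimicking a list of article links.
doc = ET.fromstring(
    '<root>'
    '<a class="contentHerf" href="/article/1">one</a>'
    '<a class="other" href="/x">skip</a>'
    '<a class="contentHerf" href="/article/2">two</a>'
    '</root>'
)
# Single quotes inside the double-quoted XPath string avoid the clash.
links = doc.findall(".//a[@class='contentHerf']")
hrefs = [a.get("href") for a in links]
print(hrefs)  # ['/article/1', '/article/2']
```

lxml accepts the same quoting style, so the pattern carries over directly to tree.xpath calls.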