How do I scrape movie download links from the zimuzu.tv subtitle-group site with Python?

The crawlers I found online scrape the links directly from the http://www.zimuzu.tv/resource/xxxxx page, but that only worked in the past. Nowadays, on the resource page you have to click a 「點擊查看本片全部資源下載頁」 ("view the download page for all of this title's resources") button to obtain a time-limited dynamic link to a page that holds all of the downloads. As shown below:

So my idea was to scrape that dynamic link directly from the http://www.zimuzu.tv/resource/xxxxx page. However, in the page fetched with Python's requests library, the spot where the download link should appear looks like this instead:

Capturing the traffic with Burp, I found that the download-page link only shows up after the page has loaded http://c.cnzz.com/core.php?web_id=1254180690&t=z. The detailed capture looks like this:

What that request returns is a large chunk of JavaScript that I can't make sense of.

Could someone advise how to obtain the link to this dynamic download page in this situation? I have already tried changing the User-Agent.

Below is the HTML source shown in Chrome DevTools after the page has finished loading; it is completely different from what requests returns. Apparently some JavaScript runs after load and replaces the contents of the resource-box element.
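If the link really is injected by JavaScript, one workaround would be to let a real browser render the page and read the DOM afterwards. A minimal Selenium sketch, assuming chromedriver is on PATH; the ".resource-box a" selector, and whether the site also requires being logged in, are assumptions on my part:

from selenium import webdriver

driver = webdriver.Chrome()              # assumes chromedriver is on PATH
driver.get("http://www.zimuzu.tv/resource/25610")
driver.implicitly_wait(10)               # give the injected script time to run
# the ".resource-box a" selector is a guess based on the element mentioned above
for a in driver.find_elements_by_css_selector(".resource-box a"):
    print(a.get_attribute("href"))       # the time-limited download-page link should be among these
driver.quit()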


Take the movie in the screenshot below as an example:

What we want to get:

Python 3 code:

import re
import requests

# the pattern here was garbled by HTML escaping in the original answer; this is
# a plausible stand-in that matches the time-limited download-page link
RegEx = r'href="(http://xiazai00\d\.com/[^"<]+)"'
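# (sketch of the lost remainder, not the author's code) presumably the snippet
# went on to fetch the resource page and print whatever the pattern matches;
# the URL below is the example resource from later in this answer
resource_url = "http://www.zimuzu.tv/resource/25610"
page = requests.get(resource_url, headers={"User-Agent": "Mozilla/5.0"}).text
for link in re.findall(RegEx, page):
    print(link)  # e.g. http://xiazai002.com/l356f2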

Result after following the link:

Extraction results:

Python 3 code:

import re
import requests

# the original pattern was garbled by HTML escaping; a plausible reconstruction
# that captures (link, name) pairs from the anchors on the download page
RegEx = r'<a href="([^"<]+)"[^>]*>([^<]+)</a>'
url = "http://xiazai002.com/l356f2"
header = {
    "Host": "xiazai002.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Referer": "http://www.zimuzu.tv/resource/25610",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

url_content = requests.get(url, headers=header).text

url_list = re.findall(RegEx, url_content)
for item in url_list:
    print("{} {}".format(item[1], item[0]))


A fuller, interactive version of the same idea, using BeautifulSoup to walk the download page:

import re
import requests
import bs4
from bs4 import BeautifulSoup

header = {
    "Host": "xiazai002.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    "Referer": "",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"}

class Resource:
    def __init__(self, name, type, source_num):
        self.name = name
        self.type = type
        self.source_num = source_num


class Tab:
    def __init__(self, name, source_li, tab_num_list):
        self.name = name
        self.source_li = source_li
        self.tab_num_list = tab_num_list

    def get_size(self):
        return len(self.source_li)

    def print_source_li(self):
        i = 1
        for li in self.source_li:
            print(str(i) + ". " + li)
            i += 1

def get_actual_links(soup, tab_num):
    frame = soup.find("div", attrs={"role": "tabpanel", "class": "tab-pane", "id": tab_num})
    whole_season = frame.find("ul", attrs={"class": "down-list"})
    episodes = whole_season.children
    episode_list = []
    for e in episodes:
        if e.name == "li":
            episode_list.append(e)
    # down_links = []
    for e in episode_list:
        source_name = e.find("div", class_="title").find("span", class_="filename").string
        source_size = e.find("div", class_="title").find("span", class_="filesize").string
        try:
            p = e.find("p", attrs={"class": "desc"}, text="磁力")
            link = p.parent.get("href")
        except AttributeError:
            try:
                p = e.find("p", attrs={"class": "desc"}, text="電驢")
                link = p.parent.get("href")
            except AttributeError:
                link = ""
        finally:
            if link != "":
                print("[*] <" + source_name + "> " + source_size + ":")
                print(link)
                print()
                # down_links.append(link)
            else:
                print("[*] 無磁力或電驢鏈接")  # no magnet or ed2k link found
                print()

def get_download_links(url, t):
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.text, "lxml")
    side_bar = soup.find("div", attrs={"class": "sidebar-warpper", "id": "scrollspy"}).ul
    tabs = side_bar.children
    tab_list = []
    for tab in tabs:
        if tab.name == "li":
            tmp_list = []
            tmp_list2 = []
            name = tab.find("a").string
            ul = tab.ul
            if ul:
                tabs2 = tab.ul.children
                for child in tabs2:
                    if child.name == "li":
                        a = child.find("a")
                        items = a.children
                        for i in items:
                            if type(i) == bs4.element.NavigableString and i != "在線看":
                                tmp_list.append(i)
                                tmp_list2.append(a.get("aria-controls"))
            else:
                a = tab.find("a")
                items = a.children
                for i in items:
                    if type(i) == bs4.element.NavigableString and i != "在線看":
                        tmp_list.append(i)
                        tmp_list2.append(a.get("aria-controls"))
            tab_list.append(Tab(name, tmp_list, tmp_list2))
        else:
            continue

    if len(tab_list) != 0:
        if len(tab_list) == 1 and t == "電影":
            season_choose = tab_list[0]
        else:
            i = 1
            for tab in tab_list:
                print(str(i) + ". " + tab.name)
                i += 1
            season_choose = tab_list[int(input("選擇: ")) - 1]
        season_choose.print_source_li()
        type_choose = int(input("選擇: ")) - 1
        get_actual_links(soup, season_choose.tab_num_list[type_choose])
    else:
        print("沒有資源")  # no resources available

def get_resource_link(t, source_number):
    # the pattern here was garbled by HTML escaping in the original post; this is
    # a plausible stand-in for matching the time-limited download-page link
    RegEx = r'href="(http://xiazai00\d\.com/[^"<]+)"'
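    # (sketch, not the author's code) the rest of this function was lost in the
    # original post; presumably it fetched the resource page and returned the link
    page = requests.get("http://www.zimuzu.tv/resource/" + str(source_number), headers=header).text
    match = re.search(RegEx, page)
    return match.group(1) if match else ""


# (sketch, not the author's code) how the functions above are presumably chained;
# 25610 and "電影" are assumed example arguments
if __name__ == "__main__":
    link = get_resource_link("電影", 25610)
    if link:
        get_download_links(link, "電影")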


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# python3
"""
Package : ZimuzuTool
Function : From zimuzu.tv scrap info about movie and tv.
Author : bihuchao &
"""

import re
import json
import requests
from bs4 import BeautifulSoup

class Zimuzu():
    """
    Zimuzu Class"""

    def __init__(self, username, password):
        self.session = requests.Session()
        self.session.verify = False
        self.Login(username, password)
        self.data = {}

    def Login(self, username, password):
        """
        Login"""

        loginData = {
            "account"  : username,
            "password" : password,
            "remember" : "1",
            "url_back" : "http://www.zimuzu.tv/",
        }
        self.session.post(url="http://www.zimuzu.tv/User/Login/ajaxLogin", data=loginData)

    def GetDownloadInfo(self, url):
        """
        GetDownloadInfo"""

        response = self.session.get(url=url)
        soup = BeautifulSoup(response.text, "lxml")
        self.data["Name"] = str(soup.title.string)
        self.data["Media"] = []
        for media in soup.find_all("div", {"class": "media-list"}):
            subData = {}
            # skip the 離線+在線 (offline + online) sections; I don't need them
            if("離線+在線" == str(media.h2.string)):
                continue
            subData["Type"] = str(media.h2.string)

            subData["Detail"] = []
            for mediaDetail in media.find_all("li"):
                mediaData = {}
                mediaData["format"] = mediaDetail["format"]
                mediaData["season"] = mediaDetail["season"]
                mediaData["episode"] = mediaDetail["episode"]
                mediaData["title"] = str(mediaDetail.find("a", {"target": "_blank"}).string)
                try:
                    # the size is shown in parentheses; the escaped backslashes were lost in the original post
                    mediaData["size"] = re.findall(r"\((.*?)\)", str(mediaDetail.find("font", {"class": "f3"}).string), re.S)[0]
                except:
                    pass
                mediaData["link"] = []
                for downloadLink in mediaDetail.find("div", {"class": "fr"}).find_all("a"):
                    if(str(downloadLink.string) in ["迅雷-電驢", "迅雷-磁力"]):
                        mediaData["link"].append(
                            {
                                "Name": str(downloadLink.string),
                                "href": downloadLink["href"],
                            }
                        )
                subData["Detail"].append(mediaData)
            if(0 == len(subData["Detail"])):
                continue
            self.data["Media"].append(subData)

        return self.data

    def FitlerInfo(self, type=None, season=None, linkType=None):
        """
        FitlerInfo"""

        if not self.data:
            return None
        url = []
        for subData in self.data["Media"]:
            if(type and type != subData["Type"]):
                continue
            for mediaData in subData["Detail"]:
                if(season and season != mediaData["season"]):
                    continue
                #print(mediaData["title"])
                #if("size" in mediaData):
                #    print(mediaData["size"])
                for linkData in mediaData["link"]:
                    if(linkType and linkType != linkData["Name"]):
                        continue
                    url.append(linkData["href"])
                    #print(linkData["href"])

        return url

    def Tofile(self, filename=None):
        """
        Tofile"""

        if not self.data:
            return False
        if not filename:
            filename = "{0}.txt".format(ToLegalFilename(self.data["Name"]))
        with open(filename, "w") as f:
            json.dump(self.data, f, ensure_ascii=False, indent=4)

        return True

def ToLegalFilename(filename):
    """
    ToLegalFilename"""

    filename = filename.replace("\\", "_")
    filename = filename.replace("/", "_")
    filename = filename.replace(":", "_")
    filename = filename.replace("*", "_")
    filename = filename.replace("?", "_")
    filename = filename.replace("\"", "_")
    filename = filename.replace("<", "_")
    filename = filename.replace(">", "_")
    filename = filename.replace("|", "_")
    filename = filename.replace(" ", "_")

    return filename

if __name__ == "__main__":
zimuzu = Zimuzu(yourUsername, yourPassword)

# 權力的遊戲,Game of Thrones
zimuzu.GetDownloadInfo("http://www.zimuzu.tv/resource/list/10733")
zimuzu.Tofile()
urls = zimuzu.FitlerInfo(type="HR-HDTV", season="1", linkType="迅雷-電驢"):

# 加勒比海盜5:死無對證,Pirates of the Caribbean: Dead Men Tell No Tales
zimuzu.GetDownloadInfo("http://www.zimuzu.tv/resource/list/35374")
zimuzu.Tofile()
urls = zimuzu.FitlerInfo(type="HR-HDTV", linkType="迅雷-電驢"):
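    # (addition, not in the original answer) show the links FitlerInfo collected
    for u in urls:
        print(u)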

