如何優雅的用腳本自動下載B站收藏夾的視頻
感謝評論區的某些更細節的實現.昨天重寫了邏輯,改進了某些繁雜的步驟和操作.
import json
import os
import re
import subprocess

import requests
from bs4 import BeautifulSoup


def _remove_if_exists(path):
    """Delete *path* if it is present; do nothing otherwise."""
    if os.path.exists(path):
        os.remove(path)


class BiliBili:
    """Log in to bilibili and download every video in the account's favourites.

    The workflow (see ``start``) is: log in (cookie file first, password as a
    fallback), find the account's ``mid``, list the ``aid`` of each favourite
    video, resolve every part of every video to a ``cid``, then download the
    media for each ``cid``.  Multi-segment media is merged with ``ffmpeg``.
    """

    Appkey = "f3bb208b3d081dc8"
    # Raw strings so that ``\d`` reaches the regex engine unchanged.
    mid_pattern = re.compile(r"var _bili_space_mid = (\d+);")
    cid_pattern = re.compile(r"cid=(\d+)&aid=")
    headers = {
        "Host": "passport.bilibili.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Referer": "https://passport.bilibili.com/ajax/miniLogin/minilogin",
        "Content-Type": "application/x-www-form-urlencoded",
        "Connection": "keep-alive",
        "Origin": "https://passport.bilibili.com",
    }
    post_url = "https://passport.bilibili.com/ajax/miniLogin/login"
    ajax_base_url = "http://space.bilibili.com/ajax/fav/getBoxList?mid="

    def __init__(self, username=0, password=0):
        """Create a session and remember the login form for *username*/*password*."""
        self.session = requests.session()
        self.post = {
            "keep": 0,
            "userid": username,
            "pwd": password,
            "captcha": "",
        }
        self.mid_url = None
        self.aid_list = []
        # more_p_url, more_p_name and cid_list are parallel lists: index i of
        # each one describes the same video part.
        self.more_p_url = []
        self.more_p_name = []
        self.cid_list = []

    @staticmethod
    def _extension_for(url):
        """Return ".flv" or ".mp4" depending on which appears in *url*, else None."""
        if ".flv" in url:
            return ".flv"
        if ".mp4" in url:
            return ".mp4"
        return None

    def cookies_login(self):
        """Load cookies saved by a previous password login into the session."""
        with open("bilibili_cookies", "r") as f:
            cookies = json.load(f)
        self.session.cookies.update(cookies)

    def password_login(self):
        """Post the login form and, on success, save the session cookies to disk."""
        r = self.session.post(url=self.post_url, data=self.post, headers=self.headers)
        json_data = json.loads(r.text)
        if json_data.get("status"):
            print("password login success!")
            with open("bilibili_cookies", "w") as f:
                json.dump(self.session.cookies.get_dict(), f)
        else:
            # Previously a failed login was silent and only surfaced later as
            # an unrelated IndexError.
            print("password login failed!")

    def fetch_your_collection_mid(self):
        """Read the account ``mid`` from the space page and build the favourites URL.

        Raises:
            RuntimeError: if the page does not contain the ``_bili_space_mid``
                assignment (presumably because the session is not logged in).
        """
        page_source = self.session.get("http://space.bilibili.com/#!/favlist").text
        match = self.mid_pattern.search(page_source)
        if match is None:
            raise RuntimeError("could not find _bili_space_mid in the space page")
        self.mid_url = self.ajax_base_url + match.group(1)

    def fetch_your_collection_av_number(self):
        """Collect the ``aid`` of every video in every favourites box."""
        information = self.session.get(self.mid_url).text
        json_data = json.loads(information)
        # Sample response (abridged):
        # {"status": True, "data": {"list": [
        #     {"videos": [{"aid": 4801304, "pic": "http://i0.hdslb.com/..."},
        #                 {"aid": 4834239, "pic": "http://i0.hdslb.com/..."}],
        #      "ctime": 1442651554, "fav_box": 15453253, "max_count": 200,
        #      "atten_count": 0, "count": 2, "state": 0, "name": "默認收藏夾"},
        #     {"videos": [{"aid": 4624829, ...}, {"aid": 4572171, ...}],
        #      "ctime": 1465127326, "fav_box": 30047510, "max_count": 150,
        #      "atten_count": 0, "count": 2, "state": 2, "name": "哦.interesting"}],
        #   "count": 2}}
        for element in json_data["data"]["list"]:
            for video in element["videos"]:
                self.aid_list.append(video["aid"])

    def get_the_source_information(self):
        """Resolve each ``aid`` to the page URL and title of every one of its parts.

        A video page with ``<option>`` elements is treated as multi-part: each
        option contributes one URL/name pair.  Otherwise the page itself is the
        single part and its title is read from the ``qr-info-head`` div.
        """
        for aid in self.aid_list:
            page_url = "http://www.bilibili.com/video/av{}/".format(aid)
            information = self.session.get(page_url).text
            soup = BeautifulSoup(information, "lxml")
            options = soup.find_all("option")
            if options:
                for element in options:
                    self.more_p_url.append("http://www.bilibili.com" + element["value"])
                    self.more_p_name.append(element.string)
            else:
                self.more_p_url.append(page_url)
                self.more_p_name.append(soup.find("div", {"class": "qr-info-head"}).string)
        print(self.more_p_url)
        print(self.more_p_name)

    def fetch_cid(self):
        """Extract the ``cid`` of every part page listed in ``more_p_url``.

        Raises:
            RuntimeError: if a page has no ``cid=...&aid=`` fragment; skipping
                it would misalign ``cid_list`` with ``more_p_name``.
        """
        for element in self.more_p_url:
            information = self.session.get(element).text
            match = self.cid_pattern.search(information)
            if match is None:
                raise RuntimeError("could not find a cid in " + element)
            self.cid_list.append(match.group(1))
        print(self.cid_list)

    def dowload_by_flv_or_mp4(self, url, name):
        """Download *url* to ``name + ".flv"`` or ``name + ".mp4"``.

        The extension is chosen by which of the two appears in *url*; a URL
        containing neither is ignored.  Network and file errors propagate to
        the caller.
        """
        extension = self._extension_for(url)
        if extension is None:
            return
        filename = name + extension
        print("Begin to dowload" + filename)
        # Stream to disk so a large video is never held in memory at once.
        response = self.session.get(url, stream=True)
        response.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 16):
                f.write(chunk)
        print("finished!")

    def ffmpeg_txt(self, url, count):
        """Append segment number *count* to ``hello.txt``, the ffmpeg concat list."""
        extension = self._extension_for(url)
        if extension is None:
            return
        with open("hello.txt", "a") as r:
            # One line per segment, in the form:  file '<name>'
            r.write("file '{}{}'\n".format(count, extension))

    def _download_first_available(self, entry, name):
        """Try ``entry["url"]`` then each backup URL; return the one that worked.

        Returns None when every candidate failed or none is a .flv/.mp4 URL.
        """
        candidates = []
        if entry.get("url"):
            candidates.append(entry["url"])
        candidates.extend(entry.get("backup_url") or [])
        for url in candidates:
            if self._extension_for(url) is None:
                continue
            try:
                self.dowload_by_flv_or_mp4(url, name)
            except (requests.RequestException, OSError) as error:
                print(error)
                continue
            return url
        return None

    def get_movies_information(self):
        """Download the media for every ``cid``, merging multi-segment videos.

        A single-segment video is saved directly under its part name.  A
        multi-segment video is saved as ``1.flv``, ``2.flv``, ... and joined by
        ``ffmpeg -f concat`` into ``<part name>.flv``; the numbered files and
        ``hello.txt`` are deleted afterwards.
        """
        for index, cid in enumerate(self.cid_list):
            name = self.more_p_name[index]
            playurl = ("http://interface.bilibili.com/playurl?appkey=" + self.Appkey
                       + "&cid=" + cid + "&otype=json")
            information = self.session.get(playurl).text
            json_data = json.loads(information)
            segments = json_data.get("durl")
            if not segments:
                print("failed to dowload" + name)
                continue

            if len(segments) == 1:
                if self._download_first_available(segments[0], name) is None:
                    print("failed to dowload" + name)
                continue

            # A list left over from an aborted run would corrupt this merge,
            # because ffmpeg_txt appends.
            _remove_if_exists("hello.txt")
            count = 1
            for element in segments:
                url = self._download_first_available(element, str(count))
                if url is None:
                    print("failed to dowload" + name)
                    continue
                self.ffmpeg_txt(url, count)
                count += 1
            if count > 1:
                command = ["ffmpeg", "-f", "concat", "-i", "hello.txt",
                           "-c", "copy", "{}.flv".format(name)]
                subprocess.call(command)
            # range goes one past the last success to catch a partial file
            # left by a failed final segment.
            for i in range(1, count + 1):
                _remove_if_exists(str(i) + ".flv")
                _remove_if_exists(str(i) + ".mp4")
            _remove_if_exists("hello.txt")

    def start(self):
        """Run the whole workflow: log in, list favourites, download everything."""
        if os.path.exists("bilibili_cookies"):
            try:
                self.cookies_login()
            except (OSError, ValueError):
                # Unreadable or corrupt cookie file: fall back to the password.
                self.password_login()
        else:
            self.password_login()
        self.fetch_your_collection_mid()
        self.fetch_your_collection_av_number()
        self.get_the_source_information()
        self.fetch_cid()
        self.get_movies_information()


# http://www.bilibili.com/video/av3749039/
# http://www.bilibili.com/video/av4801304/
if __name__ == "__main__":
    cookies = BiliBili("your phone number", "your password")
    cookies.start()
默認登錄一次之後就使用cookies登錄了.
使用ffmpeg合成視頻.
==========================
這應該算是Python的簡單練手小項目吧? :P.大概思路是:
1.模擬登錄到B站.
2.嘗試性獲取收藏夾裡面的視頻av號
3.批量下載收藏夾裡面的視頻
關於第三條,當初有兩個想法:
1.調用you-get.見github項目
2.或者自己寫邏輯:P.
首先,先看第一種,利用you-get來下載,首先你得保證自己安裝了you-get :P.
import json
import subprocess
import urllib.request

import requests

base_url = "https://passport.bilibili.com/ajax/miniLogin/minilogin"
post_url = "https://passport.bilibili.com/ajax/miniLogin/login"
ajax_base_url = "http://space.bilibili.com/ajax/fav/getBoxList?mid="
headers = {
    "Host": "passport.bilibili.com",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Referer": "https://passport.bilibili.com/ajax/miniLogin/minilogin",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "keep-alive",
    "Origin": "https://passport.bilibili.com",
}


class Cookies:
    """Log in to bilibili and save the resulting cookies to ``cookies_request``."""

    def __init__(self, username, password):
        """Remember the login endpoints and the form for *username*/*password*."""
        self.base_url = base_url
        self.post_url = post_url
        # Copy, so adding a Cookie header below does not mutate the shared
        # module-level dict.
        self.headers = dict(headers)
        self.request = requests
        self.post = {
            "keep": 0,
            "userid": username,
            "pwd": password,
            "captcha": "",
        }

    def to_add_headers_cookies(self):
        """Fetch the login page and send its cookies back as a ``Cookie`` header."""
        r = self.request.get(self.base_url, headers=self.headers).cookies
        # Join all cookies; assigning inside the loop kept only the last one.
        pairs = [i.name + "=" + i.value for i in r]
        if pairs:
            self.headers["Cookie"] = "; ".join(pairs)

    def get_cookies(self):
        """Post the login form and write the response cookies as ``name=value;``."""
        s = self.request.post(self.post_url, data=self.post, headers=self.headers)
        with open("cookies_request", "w") as f:
            for i in s.cookies:
                f.write(i.name + "=" + i.value + ";")

    def start(self):
        """Run both login steps in order."""
        self.to_add_headers_cookies()
        self.get_cookies()


class GetMovies:
    """Fetch the favourites page with the saved cookies and store it in ``data.txt``."""

    def __init__(self):
        # Favourites page.
        self._url = "http://space.bilibili.com/#!/favlist"
        self._session = requests.session()
        self._headers = None

    def get_cookies(self):
        """Read ``cookies_request`` and parse it into a name -> value dict.

        The dict is stored in ``self._headers`` (the name is historical) and is
        passed as ``cookies=`` by ``get_movies``.
        """
        with open("cookies_request", "r") as f:
            data = f.read()
        cookies = {}
        # Parse every entry instead of assuming exactly four, and skip the
        # empty piece after the trailing ";".
        for item in data.replace(" ", "").split(";"):
            name, separator, value = item.partition("=")
            if separator:
                cookies[name] = value
        self._headers = cookies

    def get_movies(self):
        """Download the favourites page and save its HTML to ``data.txt``."""
        data = self._session.get(self._url, cookies=self._headers)
        # Explicit encoding: the page contains non-ASCII text.
        with open("data.txt", "w", encoding="utf-8") as f:
            f.write(data.text)

    def start(self):
        """Load the cookies, then fetch and save the page."""
        self.get_cookies()
        self.get_movies()


class Data:
    """Extract the account ``mid`` from ``data.txt`` and list the favourite ``aid``s."""

    def __init__(self):
        with open("data.txt", "r", encoding="utf-8") as f:
            self._data = f.read()
        self.url_more = None
        self.json = None
        self.base_url = ajax_base_url

    def find_the_mid(self):
        """Find the ``_bili_space_mid = <mid>;`` assignment and build the AJAX URL.

        Every uploader has a unique mid value.

        Raises:
            ValueError: if the saved page does not contain the assignment.
        """
        index = self._data.find("_bili_space_mid")
        if index == -1:
            raise ValueError("_bili_space_mid not found in data.txt")
        statement = self._data[index:].partition(";")[0]
        parts = statement.replace(" ", "").split("=")
        if len(parts) < 2 or not parts[1]:
            raise ValueError("_bili_space_mid has no value in data.txt")
        self.url_more = self.base_url + parts[1]

    def get_json(self):
        """Fetch the favourites list and store the decoded JSON in ``self.json``."""
        with urllib.request.urlopen(self.url_more) as response:
            base_json = response.read().decode("utf-8")
        self.json = json.loads(base_json)

    def get_aid(self):
        """Return the ``aid`` of every video in every favourites box."""
        return [
            video["aid"]
            for box in self.json["data"]["list"]
            for video in box["videos"]
        ]

    def start(self):
        """Run the extraction and return the list of ``aid`` values."""
        self.find_the_mid()
        self.get_json()
        return self.get_aid()


def dowload(av_list):
    """Call ``you-get`` once for the video page of each av number in *av_list*."""
    for av in av_list:
        av_url = "http://www.bilibili.com/video/av" + str(av) + "/"
        command = ["you-get", av_url]
        subprocess.call(command)


if __name__ == "__main__":
    username = input("enter your phone number")
    pwd = input("enter your password")
    cookies = Cookies(username, pwd)
    cookies.start()
    getMovies = GetMovies()
    getMovies.start()
    data = Data()
    dowload(data.start())
基本邏輯很簡單:P.代碼渣,僅供參考
第二種 通過自己寫下載邏輯來實現.
首先參考:you-get上的bilibili分支
可以知道,B站的視頻源為 "http://interface.bilibili.com/playurl?appkey="+Appkey+"&cid="+cid
Appkey是固定的
每個bilibili視頻都有一個cid值.
打開這個視頻源後,會發現如果視頻長度較長(比如一個24分鐘的視頻),就會被分成多個獨立的url去下載
假如視頻有很多個p的話,比如命運石之門,就需要找一找每個地方的對應關係了:P
從主視頻界面解析前端代碼.然後將每個分p的cid和name對應起來
大概就是
dict[cid]=name
之後就是訪問每個視頻的視頻源 了:P
然後同樣
#將cid的值和視頻源中篩選過的視頻源對應起來,其中視頻源是一個list
dict_1[cid]=[]
下面的邏輯就非常清晰了..
在這裡我的代碼默認利用了ffmpeg來合成分p視頻:P.還有linux 下rm -rf等命令,所以應該只能在Linux的環境才能跑起來:P.但是思路是相通的.
import os
import re
import subprocess
import urllib.request

import requests
from bs4 import BeautifulSoup

path = re.compile(r"<url>(.+?)</url>")
# Same pattern the session-based script uses to read a cid from a video page.
cid_path = re.compile(r"cid=(\d+)&aid=")
Appkey = "f3bb208b3d081dc8"


def _remove_if_exists(filename):
    """Delete *filename* if it is present; do nothing otherwise."""
    if os.path.exists(filename):
        os.remove(filename)


class Cid:
    """Resolve one av number to the cid and title of each of its parts."""

    def __init__(self, av_url):
        """*av_url* is the av number as a string (digits only)."""
        self.av_url = []
        self.av_url.append("http://www.bilibili.com/video/av" + av_url + "/")
        self.requests = requests
        self.more_P_url = []
        self.more_p_name = []
        self.cid_more = []
        self.more_p_name_cid = {}

    def fetch_url_and_name(self):
        """Collect the URL and title of every part of the video.

        A page with ``<option>`` elements is treated as multi-part; otherwise
        the page itself is the single part, titled by its ``qr-info-head`` div.
        """
        for page_url in self.av_url:
            base_data = self.requests.get(page_url).text
            soup = BeautifulSoup(base_data, "lxml")
            options = soup.find_all("option")
            if options:
                # Distinct loop variable: the original reused the outer name.
                for option in options:
                    self.more_p_name.append(option.string)
                    self.more_P_url.append(option["value"])
            else:
                # attrs must be a dict; the original passed a set literal.
                title = soup.find_all("div", {"class": "qr-info-head"})[0].string
                self.more_p_name.append(title)
                self.more_P_url.append(page_url)

    def fetch_cid(self):
        """Fetch each part page, read its cid, and map cid -> part title.

        A part whose page has no ``cid=...&aid=`` fragment is reported and
        skipped, so ``cid_more`` and ``more_p_name_cid`` stay consistent.
        """
        for part_url, part_name in zip(self.more_P_url, self.more_p_name):
            if "http://" not in part_url:
                more_url = "http://www.bilibili.com" + part_url
            else:
                more_url = part_url
            base_data = self.requests.get(more_url).text
            match = cid_path.search(base_data)
            if match is None:
                print("could not find a cid in {}".format(more_url))
                continue
            cid = match.group(1)
            self.cid_more.append(cid)
            self.more_p_name_cid[cid] = part_name

    def start(self):
        """Run both resolution steps in order."""
        self.fetch_url_and_name()
        self.fetch_cid()


class Download:
    """Download the flv segments of each cid and merge them with ffmpeg."""

    def __init__(self, cid, cid_name):
        """*cid* is a list of cid strings; *cid_name* maps cid -> output title."""
        self.cid = cid
        self.cid_name = cid_name
        self.request = requests
        self.base_cid_url = "http://interface.bilibili.com/playurl?appkey=" + Appkey + "&cid="
        self.cid_url_list = {}
        self.cid_dowload_list = {}

    def get_dowload_url(self):
        """Query the playurl interface and store every ``<url>`` value per cid."""
        for cid in self.cid:
            url_list = []
            cid_url = self.base_cid_url + cid
            XMLDATA = self.request.get(cid_url).text
            for raw in path.findall(XMLDATA):
                marker = raw.find("CDATA[")
                if marker == -1:
                    # No CDATA wrapper: take the element text as it is.
                    url_list.append(raw)
                    continue
                data_More = raw[marker + len("CDATA["):]
                index_right = data_More.find("]")
                if index_right == -1:
                    url_list.append(data_More)
                else:
                    url_list.append(data_More[0:index_right])
            if not url_list:
                print("interface 抓取不到的視頻源!出現錯誤!")
            self.cid_url_list[cid] = url_list

    def begin_dowload(self):
        """Pick, in order, one flv URL per segment for every cid.

        Segment N is recognised by ``-N.flv?`` in the URL.  The marker is
        generated on the fly, so any number of segments is supported.
        """
        cid_dowload_List = {}
        for cid in self.cid_name:
            dowload_list = []
            segment = 1
            for url in self.cid_url_list.get(cid, []):
                if "-{}.flv?".format(segment) in url:
                    dowload_list.append(url)
                    segment += 1
            cid_dowload_List[cid] = dowload_list
        self.cid_dowload_list = cid_dowload_List
        print(cid_dowload_List)

    def dowload(self):
        """Download each cid's segments, merge them, then delete the temp files.

        Segments are saved as ``1.flv``, ``2.flv``, ... and joined by
        ``ffmpeg -f concat`` into ``<title>.flv``.  A segment that fails is
        reported and skipped; the rest still merge.
        """
        sorted_list = sorted(self.cid_dowload_list.items(), key=lambda d: d[0])
        for cid, segment_urls in sorted_list:
            print((cid, segment_urls))
            # A list left over from an aborted run would corrupt this merge,
            # because entries are appended.
            _remove_if_exists("hello.txt")
            flv_name = []
            merged_any = False
            count = 1
            for segment_url in segment_urls:
                print(segment_url)
                name = str(count) + ".flv"
                flv_name.append(name)
                count = count + 1
                print("begin to dowload{}".format(name))
                try:
                    with urllib.request.urlopen(segment_url) as response, \
                            open(name, "wb") as f:
                        # Copy in chunks so a large segment is never held in
                        # memory at once.
                        while True:
                            chunk = response.read(1 << 16)
                            if not chunk:
                                break
                            f.write(chunk)
                except Exception as e:
                    # Best effort: report this segment and carry on.
                    print(e)
                    continue
                # List the segment only after it downloaded completely.
                with open("hello.txt", "a") as r:
                    r.write("file '{}'\n".format(name))
                merged_any = True
                print("dowload success!")
            if merged_any:
                # No space before ".flv", so the file matches the name printed
                # below.
                output_name = "{}.flv".format(self.cid_name[cid])
                command = ["ffmpeg", "-f", "concat", "-i", "hello.txt",
                           "-c", "copy", output_name]
                subprocess.call(command)
                print("dowload the flv as {}.flv".format(self.cid_name[cid]))
            for name in flv_name:
                _remove_if_exists(name)
            _remove_if_exists("hello.txt")

    def start(self):
        """Run the three download stages in order."""
        self.get_dowload_url()
        self.begin_dowload()
        self.dowload()


if __name__ == "__main__":
    av_number = input("plz input av number:")
    cid = Cid(av_url=av_number)
    cid.start()
    dowload = Download(cid.cid_more, cid.more_p_name_cid)
    dowload.start()
我的代碼默認只篩選了flv源的url,假如說flv源沒有速度的話..那就下載不成視頻了.:P,有興趣的小夥伴可以再完善一下.
代碼渣,僅供參考:P
再提供一些小腳本的思路給小夥伴們吧:P:
1.如何利用smtp-pop將自己電腦文件夾的圖片或者其他東西發送到自己的郵箱上
2.如何根據自己知乎的關注人數實時的改變自己的簽名.類似於"你要不要考慮成為我的第XX個粉絲"
3.見ZhihuSayHi
最後歡迎交流,學習: P.
推薦閱讀:
※Python3如何實現兩個列表的交叉列印?
※為何大量設計模式在動態語言中不適用?
※Python的大數運算到底是根據什麼基礎原理或者演算法實現的?
※Python數據採集(爬蟲)淺談