[python] re爬取HTML網頁標籤信息總結

05-20

來自專欄煲飯醬的鍋

Evernote筆記轉移至專欄以自娛

# -*- coding:utf-8 -*-
# Regex recipes for pulling tag content out of HTML. Author: Klaus_Lyu.
#
# Recipes, in the order of the original notes:
#   re_label_script    - image URLs embedded in a <script> block
#   re_label_span      - table cell text, unwrapping <a> / <span>
#   re_label_sub       - strip every tag with re.sub
#   re_label_table     - consecutive <td> pairs
#   re_label_title     - first <a ... title=...> on a live page
#   re_label_tr(td/th) - rows, header cells and data cells
#   re_label_head      - <link href> and <meta http-equiv> inside <head>
#   re_label_href      - anchors and href values on a live page
#
# Every recipe is a small function; network and filesystem work happens only
# in main(), so importing this module has no side effects.
import os
import re
from urllib.request import urlopen
from urllib.request import urlretrieve

# "." must also match newlines because the sample markup may span lines.
_FLAGS = re.S | re.M


# ---------------------------------------------------------------------------
# re_label_script - author: Klaus_Lyu
# Create a folder and save the pictures referenced by a <script> block.
# ---------------------------------------------------------------------------
SCRIPT_HTML = (
    '<script>var images = [{ '
    '"big":"http://i-2.yxdown.com/2015/3/18/KDkwMHgp/6381ccc0-ed65-4422-8671-b3158d6ad23e.jpg", '
    '"thumb":"http://i-2.yxdown.com/2015/3/18/KHgxMjAp/6381ccc0-ed65-4422-8671-b3158d6ad23e.jpg", '
    '"original":"http://i-2.yxdown.com/2015/3/18/6381ccc0-ed65-4422-8671-b3158d6ad23e.jpg", '
    '"title":"","descript":"","id":75109},{ '
    '"big":"http://i-2.yxdown.com/2015/3/18/KDkwMHgp/fec26de9-8727-424a-b272-f2827669a320.jpg", '
    '"thumb":"http://i-2.yxdown.com/2015/3/18/KHgxMjAp/fec26de9-8727-424a-b272-f2827669a320.jpg", '
    '"original":"http://i-2.yxdown.com/2015/3/18/fec26de9-8727-424a-b272-f2827669a320.jpg", '
    '"title":"","descript":"","id":75110},</script>'
)


def mkdir(path):
    """Create directory *path* (with parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are removed from *path*
    before it is used. A status line is printed either way.

    Returns:
        True if the directory was created, False if it already existed.
    """
    path = path.strip()
    path = path.rstrip("\\")
    # Folder name without its parent path, used only in the status message.
    fp_new = os.path.basename(path)
    if os.path.exists(path):
        print(path + "文件夾" + fp_new + "已存在")
        return False
    # Create once only: a second makedirs() on the same path would raise
    # FileExistsError.
    os.makedirs(path)
    print(path + "新文件夾" + fp_new + "創建成功")
    return True


def extract_original_urls(html):
    """Return every "original" image URL found inside <script> blocks of *html*."""
    urls = []
    for script in re.findall(r"<script>(.*?)</script>", html, _FLAGS):
        urls.extend(re.findall(r'"original":"(.*?)"', script))
    return urls


def download_images(html, dest_dir):
    """Print and download each "original" image URL of *html* into *dest_dir*."""
    for pic in extract_original_urls(html):
        print(pic)
        # Keep only the file name of the URL as the local file name.
        filename = os.path.basename(pic)
        urlretrieve(pic, os.path.join(dest_dir, filename))


# ---------------------------------------------------------------------------
# re_label_span - author: Klaus_Lyu
# Read table cells while filtering out <span></span> and <a></a> wrappers.
# ---------------------------------------------------------------------------
SPAN_TABLE_HTML = (
    '<table class="infobox bordered vcard" '
    'stylex="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">'
    '<caption stylex="text-align: center; font-size: larger;" class="fn"><b>周恩來</b></caption>'
    '<tr><th>性別：</th><td>男</td>d</tr>'
    '<tr><th>異名：</th><td><span class="nickname">(字) 翔宇</span></td></tr>'
    '<tr><th>政黨：</th><td><span class="org">'
    '<a href="../articles/%E4%B8%AD9A.html" title="中國共產黨">中國共產黨</a></span></td></tr>'
    '<tr><th>籍貫：</th><td>'
    '<a href="../articles/%E6%B5%9981.html" title="浙江省">浙江省</a>'
    '<a href="../articles/%E7%BB%8D82.html" title="紹興市">紹興市</a></td></tr>'
    '</table>'
)


def find_rows(html):
    """Return the inner HTML of every <tr>...</tr> in *html*."""
    return re.findall(r"<tr>(.*?)</tr>", html, _FLAGS)


def find_header_cells(row):
    """Return the inner HTML of every <th>...</th> in *row*."""
    return re.findall(r"<th>(.*?)</th>", row, _FLAGS)


def find_data_cells(row):
    """Return the inner HTML of every <td>...</td> in *row*."""
    return re.findall(r"<td>(.*?)</td>", row, _FLAGS)


def table_cell_texts(html):
    """Return the visible text of each th/td cell of *html*, row by row.

    Header cells come first within a row, then data cells. A cell holding
    links yields one entry per link text; a cell holding only <span> yields
    the span text; any other cell is returned unchanged.
    """
    texts = []
    for row in find_rows(html):
        for header in find_header_cells(row):
            if "href" in header:
                # Keep the link text only, not the link itself.
                anchors = re.findall(r"<a href=.*?>(.*?)</a>", header, _FLAGS)
                texts.append(anchors[0] if anchors else header)
            else:
                texts.append(header)
        for cell in find_data_cells(row):
            # The order of the two checks matters: a cell with <span><a>..</a>
            # </span> carries its text in the <a>, so "href" is tested first
            # and the span branch is never reached for it.
            if "href" in cell:
                # One td may contain several links.
                texts.extend(re.findall(r"<a .*?>(.*?)</a>", cell, _FLAGS))
            elif "span" in cell:
                texts.extend(re.findall(r"<span .*?>(.*?)</span>", cell, _FLAGS))
            else:
                texts.append(cell)
    return texts


# ---------------------------------------------------------------------------
# re_label_sub - author: Klaus_Lyu
# Filter out <img .../> and all other tags, plus line breaks.
# ---------------------------------------------------------------------------
IMG_TABLE_HTML = (
    '<table class="infobox" stylex="width: 21em; text-align: left;" cellpadding="3">'
    '<tr bgcolor="#CDDBE8"><th colspan="2"><center class="role">'
    '<b>中華民國政治人士</b><br /></center></th></tr>'
    '<tr><th>性別：</th><td>男</td></tr>'
    '<tr><th>政黨：</th><td><span class="org"><img alt="中國國民黨" '
    'src="../../../../images/Kuomintang.svg.png" width_="19" height="19" border="0" />'
    '<a href="../../../../articles/%8B%E6%B0%91%E9%BB%A8.html" title="中國國民黨">中國國民黨</a>'
    '</span></td></tr></table>'
)


def strip_tags(html):
    """Return *html* with every <...> tag removed and outer whitespace trimmed.

    "</br>" is dropped and newlines become spaces before the tags are removed.
    """
    # NOTE(review): the second replace target was lost in the source text;
    # "\n" is assumed from the accompanying comment ("replace with a space").
    html = html.replace("</br>", "")
    html = html.replace("\n", " ")
    # Everything enclosed in <> is replaced by the empty string.
    return re.sub(r"<[^>]+>", "", html).strip()


# ---------------------------------------------------------------------------
# re_label_table - author: Klaus_Lyu
# ---------------------------------------------------------------------------
PAIR_TABLE_HTML = (
    '<table><tr>'
    '<td>序列號</td><td>DEIN3-39CD3-2093J3</td>'
    '<td>日期</td><td>2013年1月22日</td>'
    '<td>售價</td><td>392.70 元</td>'
    '<td>說明</td><td>僅限5用戶使用</td>'
    '</tr></table>'
)


def paired_cells(html):
    """Return (label, value) tuples built from consecutive <td> pairs in *html*."""
    return re.findall(r"<td>(.*?)</td><td>(.*?)</td>", html, _FLAGS)


# ---------------------------------------------------------------------------
# re_label_title - author: Klaus_Lyu
# ---------------------------------------------------------------------------
def first_anchor_with_title(html):
    """Return the first "<a ... title=..." fragment up to "target=", or None.

    Method one: re.search() only reports the first match.
    """
    match = re.search(r"<a.*?title=.*?(?=target=)", html, re.I | re.M)
    return match.group() if match else None


def first_title_attribute(html):
    """Return the first "title=..." fragment that follows "<a ", or None.

    Method two: re.findall() with a look-behind, keeping the first hit.
    """
    found = re.findall(r"(?<=<a )title=.*?(?=target=)", html, re.I | re.M)
    return found[0] if found else None


# ---------------------------------------------------------------------------
# re_label_tr(td/th) - author: Klaus_Lyu
# The core expressions are find_rows / find_header_cells / find_data_cells.
# ---------------------------------------------------------------------------
TR_HTML = "<tr><th>性別：</th><td>男</td></tr><tr><th>性別：</th><td>女</td></tr>"


# ---------------------------------------------------------------------------
# re_label_head - author: Klaus_Lyu
# ---------------------------------------------------------------------------
HEAD_HTML = (
    '<head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> '
    '<title>豆瓣電影 Top 250</title> '
    '<meta http-equiv="Expires" content="Sun, 6 Mar 2005 01:00:00 GMT"> '
    '<link rel="apple-touch-icon" href="https://img3.doubanio.com/f/movie/'
    'd59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png"> '
    '<script type="text/javascript">var _head_start = new Date();</script>'
    '<link href="https://img3.doubanio.com/f/movie/'
    'dcfd6c93a0b44f2495c6ab3cdf21d8508b97bb03/css/movie/top_movies.css" '
    'rel="stylesheet" type="text/css" /> '
    '<style type="text/css">img { max-width: 100%; }</style> '
    '<script type="text/javascript"></script> '
    '<link rel="shortcut icon" href="https://img3.doubanio.com/favicon.ico" '
    'type="image/x-icon"></head>'
)


def link_hrefs(html):
    """Return the href value of every <link ...> tag in *html*.

    A look-behind must have a fixed width, so (?<=link.*?) is rejected by
    `re`; (?<=<link ) is the working form. With one capture group in the
    pattern, findall() returns only the captured text.
    """
    return re.findall(r'(?<=<link ).*?href="(.*?)(?=")', html, _FLAGS)


def meta_attributes(html):
    """Return the attribute text of every <meta http-equiv=...> tag, without brackets."""
    return re.findall(r"(?<=<meta )http-equiv=.*?(?=>)", html)


def meta_tags(html):
    """Return every complete <meta http-equiv=...> tag in *html*."""
    return re.findall(r"<meta http-equiv=.*?>", html)


# ---------------------------------------------------------------------------
# re_label_href - author: Klaus_Lyu
# Crawl links selectively: study what the target links look like first and
# match on those features.
# ---------------------------------------------------------------------------
def download(html):
    """Fetch the page at URL *html* and return its source decoded as UTF-8.

    The original notes sketch detecting the encoding with chardet
    (chardet.detect(raw)["encoding"]); that part was commented out and the
    bytes are decoded as UTF-8 unconditionally.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(html) as response:
        urlorgs = response.read()
    return urlorgs.decode("utf-8")


def anchor_tags(html):
    """Return every "<a ... href=... </a>" element in *html* (rarely useful as-is)."""
    return re.findall(r"<a.*?href=.*?</a>", html, re.I | re.M)


def href_values(html):
    """Return every href value in *html*, whether double- or single-quoted."""
    return re.findall(r"(?<=href=\").+?(?=\")|(?<=href=').+?(?=')", html)


def main():
    """Run every recipe in the order of the original notes (needs network access)."""
    # re_label_script: save pictures into ./pic_down under the current directory.
    dir_new = os.path.join(os.path.abspath("."), "pic_down")
    mkdir(dir_new)
    download_images(SCRIPT_HTML, dir_new)

    # re_label_span
    for text in table_cell_texts(SPAN_TABLE_HTML):
        print(text)

    # re_label_sub
    # Expected: 中華民國政治人士性別：男政黨：中國國民黨
    print(strip_tags(IMG_TABLE_HTML))

    # re_label_table
    for line in paired_cells(PAIR_TABLE_HTML):
        print(line[0], line[1])

    # re_label_title
    request = download("http://www.csdn.net/")
    print("方法一：")
    print(first_anchor_with_title(request))
    print("方法二：")
    print(first_title_attribute(request))

    # re_label_tr(td/th): each row is printed, then its th cells, then its td cells.
    for line in find_rows(TR_HTML):
        print(line)
        for mm in find_header_cells(line):
            print(mm)
        for nn in find_data_cells(line):
            print(nn)

    # re_label_head
    for line in link_hrefs(HEAD_HTML):
        print(line)
    for line in meta_attributes(HEAD_HTML):
        print(line)
    for line in meta_tags(HEAD_HTML):
        print(line)

    # re_label_href: source of the Douban Top 250 front page.
    request = download("https://movie.douban.com/top250")
    for url in anchor_tags(request):
        print(url)
    for url in href_values(request):
        print(url)


if __name__ == "__main__":
    main()