python爬取csdn文章到wordpress
#!/usr/bin/env python
# coding=utf8
"""Crawl CSDN blog articles and publish them to a WordPress site.

This listing contains two source files shown back to back:

* File 1 -- the driver script with the ``WordPress`` publisher class.
* File 2 -- what appears to be ``csdn.py``, the module that provides the
  ``CSDN`` scraper class imported by file 1 (``from csdn import CSDN``).
"""

# ===================== File 1: driver script =====================
import requests
import re
import json
import time
from bs4 import BeautifulSoup
from lxml import etree
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import NewPost
from csdn import CSDN
import pymysql


class WordPress:
    """Publish articles to WordPress, via XML-RPC or direct MySQL inserts."""

    def __init__(self):
        """Open the XML-RPC client and the MySQL connection/cursor."""
        # XML-RPC endpoint, followed by the back-end user name and password.
        self.wp = Client('http://blog.zxb8.cc/xmlrpc.php', 'username', 'password')
        self.conn = pymysql.connect(host='104.224.151.80', port=3306, user='xxx',
                                    passwd='xx', db='blog', charset='utf8')
        self.cursor = self.conn.cursor()

    def sends(self, title, content):
        """Publish one post through the WordPress XML-RPC API.

        Args:
            title: Post title.
            content: Post body.
        """
        post = WordPressPost()
        post.title = title
        # post.post_type = 'tag'
        post.content = content
        post.post_status = 'publish'
        # Send the post to WordPress.
        self.wp.call(NewPost(post))
        time.sleep(3)
        print('發布成功')

    def create(self, url):
        """Scrape the article at *url* and store it as a WordPress post.

        Args:
            url: Address of a single CSDN article page.
        """
        print(url)
        csdn = CSDN(url)
        title = csdn.getTitle()
        content = csdn.getContent()
        img = csdn.getImg()
        print(img)
        # Truthiness check also tolerates a None result, not only an empty list.
        if img:
            content += " ".join(img)
        # self.sends(title, content)
        self.query(title, content, 1)
        time.sleep(3)
        print('發布成功')

    def query(self, title, content, cat):
        """Insert a post straight into the WordPress database with pymysql.

        The set of tables to insert into / update was worked out by analysing
        the MySQL binlog (per the original author's note).

        Args:
            title: Post title; also stored as ``post_name``.
            content: Post body; wrapped in ``<pre><code>`` for highlighting.
            cat: ``term_taxonomy_id`` the post is attached to.

        Raises:
            Exception: Whatever the database driver raises; the transaction
                is rolled back before the exception propagates.
        """
        # Strip the source blog's copyright banner.
        # NOTE(review): this copy of the banner text looks like it was
        # converted to Traditional Chinese; verify it matches the page text.
        content = content.replace("【工匠若水 http://blog.csdn.net/yanbober 未經允許嚴禁轉載,請尊重作者勞動成果。私信聯繫我】", "")
        times = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # Wrap the body so the highlight plugin renders it.
        content = '<pre class ="pure-highlightjs"> <code class ="">' + content + '</code> </pre>'

        # Values are passed as query parameters instead of being interpolated
        # into the SQL text, so quotes in the title or body can neither break
        # the statement nor inject SQL. The driver does the escaping, which
        # is why no manual escape_string() call is needed any more.
        sql_post = (
            "INSERT INTO wp_posts("
            "post_author,post_date,post_content,post_title,post_excerpt,"
            "post_status,comment_status,ping_status,post_name,to_ping,pinged,"
            "post_modified,post_content_filtered,post_parent,menu_order,"
            "post_type,comment_count) "
            "VALUES (1,%s,%s,%s,'','publish','open','open',%s,'','',%s,'',0,0,'post',0)"
        )
        update_sql = "UPDATE `wp_posts` SET `guid` = %s WHERE `ID` = %s"
        sql_cat = ("INSERT INTO wp_term_relationships"
                   "(object_id,term_taxonomy_id,term_order) VALUES (%s,%s,0)")

        try:
            # Insert the post row.
            self.cursor.execute(sql_post, (times, content, title, title, times))
            new_id = self.cursor.lastrowid
            # Update the guid now that the new row id is known.
            guid = "http://blog.zxb8.cc/?p={}".format(new_id)
            self.cursor.execute(update_sql, (guid, new_id))
            # Attach the post to its category.
            self.cursor.execute(sql_cat, (new_id, cat))
            # Commit all three statements together.
            self.conn.commit()
        except Exception:
            # Do not leave a half-written post behind.
            self.conn.rollback()
            raise
        # The cursor and connection stay open: query() runs once per article.
        # self.cursor.close()
        # self.conn.close()


if __name__ == '__main__':
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    wordpress = WordPress()
    list_url = "https://blog.csdn.net/yanbober/article/category/6971209"
    response = requests.get(list_url, headers=headers)
    selector = etree.HTML(response.text)
    # for url in selector.xpath('//li[@class="blog-unit"]/a/@href'):
    for url in selector.xpath('//div[@class="article_title"]//a/@href'):
        print('正在努力爬取中...', url)
        wordpress.create(url)

    # Alternative: walk the paginated article list of another blog.
    # i = 1
    # while i <= 5:
    #     page_url = "https://blog.csdn.net/mrlevo520/article/list/{}".format(i)
    #     i = i + 1
    #     response = requests.get(page_url, headers=headers)
    #     selector = etree.HTML(response.text)
    #     for url in selector.xpath('//li[@class="blog-unit"]/a/@href'):
    #         print('正在努力爬取中...', url)
    #         wordpress.create(url)


# ===================== File 2: csdn.py (presumably) =====================
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://blog.csdn.net/MrLevo520/article/details/53158050
import requests
import json
import os
from lxml import etree
import time
import random
# Import only the class that is used: ``from datetime import *`` would also
# bind the name ``time`` to ``datetime.time`` and shadow the ``time`` module.
from datetime import datetime


class CSDN():
    """Scraper for a single CSDN article page."""

    def __init__(self, url):
        """Fetch *url* and parse it into an lxml tree kept on ``self.selector``."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.url = url
        self.selector = etree.HTML(self.getHtml())

    def getHtml(self):
        """Download the article page and return its HTML text."""
        # Send the browser-like headers prepared in __init__; they were
        # defined for this purpose but previously never passed along.
        response = requests.get(self.url, headers=self.headers)
        return response.text

    def getTitle(self):
        """Return the article title with surrounding whitespace removed.

        Raises:
            IndexError: If the title element is not found on the page.
        """
        # title = self.selector.xpath('//h1[@class="csdn_top"]/text()')
        title = self.selector.xpath('//span[@class="link_title"]/a/text()')
        return title[0].strip()

    def getTag(self):
        """Return the text of every link in the article's ``article_l`` block."""
        tags = self.selector.xpath('//div[@id="article_details"]/div[contains(@class,"article_manage")]//div[@class="article_l"]//a')
        return [tag.xpath('./text()')[0] for tag in tags]

    def getReadNum(self):
        """Print the text of the first three spans of the ``article_r`` block.

        NOTE(review): judging by the variable names these are the publish
        time, read count and comment count -- confirm against the page.
        """
        base = '//div[@id="article_details"]/div[contains(@class,"article_manage")]//div[@class="article_r"]'
        # Named post_time (not ``time``) so the time module is not shadowed.
        post_time = self.selector.xpath(base + '/span[1]/text()')
        read = self.selector.xpath(base + '/span[2]/text()')
        comment = self.selector.xpath(base + '/span[3]/text()')
        print(post_time)
        print(read)
        print(comment)

    def getContent(self):
        """Return the article body as plain text.

        Raises:
            IndexError: If the ``markdown_views`` container is not found.
        """
        content = self.selector.xpath('//div[@class="markdown_views"]')
        # string(.) concatenates the text of all descendant nodes.
        return content[0].xpath('string(.)').strip()

    def getImg(self):
        """Download the article's images, re-host them, and return <img> tags.

        Each image is saved under ``./upload`` and then pushed to the image
        host via :meth:`upload`.

        Returns:
            A list of ``<img ...>`` HTML strings, one per successfully
            uploaded image; an empty list when the article has no images.
        """
        imgs = self.selector.xpath('//div[@class="markdown_views"]//img/@src')
        # The original tested ``len(imgs) < 0``, which can never be true.
        if not imgs:
            return []
        # print(imgs)
        list_imgs = []
        # Create the directory the files are saved in.
        upload = os.getcwd() + "/upload"
        if not os.path.exists(upload):
            os.mkdir(upload)
        # Download and save every image.
        for img_url in imgs:
            response = requests.get(img_url, headers=self.headers)
            # Timestamp plus a random number 0 <= n <= 100 as the file name.
            nowTime = datetime.now().strftime("%Y%m%d%H%M%S")
            # Zero-pad to two digits (the old ``<= 10`` test turned 10 into "010").
            randomNum = "{:02d}".format(random.randint(0, 100))
            file_name = str(nowTime) + randomNum + '.jpg'
            save_name = upload + '/' + file_name
            print('download..', save_name)
            with open(save_name, 'wb') as f:
                f.write(response.content)
            # Upload the image and keep it only if the host accepted it.
            remote_pic = self.upload(save_name)
            if remote_pic:
                img_src = '<img class ="alignnone size-medium" src="' + remote_pic + '" />'
                list_imgs.append(img_src)
        return list_imgs

    def upload(self, save_name):
        """Upload a local image to the sm.ms image host.

        Args:
            save_name: Path of the local image file.

        Returns:
            The hosted image URL when the API reports success, else ``None``.
        """
        url = 'https://sm.ms/api/upload'
        data = {'ssl': False, 'format': 'json'}
        # Context manager closes the file handle once the request is done.
        with open(save_name, 'rb') as image_file:
            response = requests.post(url, files={'smfile': image_file}, data=data)
        result = response.text
        # Sample response:
        # {'code': 'success', 'data': {'path': '/2018/04/19/5ad7fd2f7e60c.jpg', 'hash': 'vpw5S3armgducWz',
        #  'url': 'https://i.loli.net/2018/04/19/5ad7fd2f7e60c.jpg', 'size': 215024,
        #  'filename': '20160213173754690.jpg', 'storename': '5ad7fd2f7e60c.jpg',
        #  'width': 1366, 'ip': '124.207.180.37', 'timestamp': 1524104495, 'height': 688,
        #  'delete': 'https://sm.ms/delete/vpw5S3armgducWz'}}
        result = json.loads(result)
        print(result)
        if result.get('code') == 'success':
            return result['data']['url']
        return None
查看更多原創文章請關注公眾號:
http://weixin.qq.com/r/BEXp8TXE_GiHrXGS9xAW (二維碼自動識別)
推薦閱讀:
※Udemy上的WordPress教程促銷
※wordpress目錄文件結構說明
※WordPress 有什麼輕鬆實現 AMP 或 MIP 的姿勢嗎?
※WordPress 4.7「Vaughan」
※[AWS] 如何跳轉非 WWW到 WWW上