Scraping Sina Weibo with Python, looking for advice!

I'm scraping Sina Weibo with Python and keep getting blocked. I'm already using proxies, with 10 accounts and 10 proxies, but crawling is still very slow. Does anyone have a better approach? Thanks!


http://github.com/zhu327/rss Since you're also using Python, just read the code directly.

Scrape this instead: http://service.weibo.com/widget/widget_blog.php?uid={uid} and substitute the uid. No login is required and it doesn't get blocked.
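Roughly like this (an untested sketch: the wgtCell_txt class name is only a guess at the widget markup, so adjust the XPath if it differs):

# -*- coding: utf-8 -*-
# Sketch: fetch a user's public posts from the Weibo widget page, no login.
import requests
from lxml import etree

def fetch_widget_posts(uid):
    url = "http://service.weibo.com/widget/widget_blog.php?uid=%s" % uid
    html = requests.get(url, timeout=10).content
    selector = etree.HTML(html)
    # The class name below is a guess; check the widget HTML and adjust.
    posts = selector.xpath("//div[contains(@class, 'wgtCell_txt')]")
    return [p.xpath("string(.)").strip() for p in posts]

if __name__ == "__main__":
    for text in fetch_widget_posts("1234567890"):  # replace with a real uid
        print(text)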


Scrape the mobile site instead:
http://weibo.cn
You can refer to the code below, which comes from 极客学院 (Jikexueyuan); will take it down if it infringes.

#-*-coding:utf8-*-

import smtplib
from email.mime.text import MIMEText
import requests
from lxml import etree
import os
import time
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class mailhelper(object):
    """
    This class sends the notification e-mail.
    """
    def __init__(self):
        self.mail_host = "smtp.xxxx.com"   # SMTP server
        self.mail_user = "xxxx"            # user name
        self.mail_pass = "xxxx"            # password
        self.mail_postfix = "xxxx.com"     # sender address domain

    def send_mail(self, to_list, sub, content):
        me = "xxoohelper" + "<" + self.mail_user + "@" + self.mail_postfix + ">"
        msg = MIMEText(content, _subtype="plain", _charset="utf-8")
        msg["Subject"] = sub
        msg["From"] = me
        msg["To"] = ";".join(to_list)
        try:
            server = smtplib.SMTP()
            server.connect(self.mail_host)
            server.login(self.mail_user, self.mail_pass)
            server.sendmail(me, to_list, msg.as_string())
            server.close()
            return True
        except Exception as e:
            print str(e)
            return False

class xxoohelper(object):
    """
    This class scrapes the latest post from a Weibo page.
    """
    def __init__(self):
        self.url = "http://weibo.cn/u/xxxxxxx"   # the Weibo page to scrape
        self.url_login = "https://login.weibo.cn/login/"
        self.new_url = self.url_login

    def getSource(self):
        # Fetch the page; when not logged in this returns the login form
        html = requests.get(self.url).content
        return html

    def getData(self, html):
        # Pull the dynamic field names out of the login form
        selector = etree.HTML(html)
        password = selector.xpath("//input[@type='password']/@name")[0]
        vk = selector.xpath("//input[@name='vk']/@value")[0]
        action = selector.xpath("//form[@method='post']/@action")[0]
        self.new_url = self.url_login + action
        data = {
            "mobile": "xxxxx@xxx.com",
            password: "xxxxxx",
            "remember": "on",
            "backURL": "http://weibo.cn/u/xxxxxx",  # change to the Weibo page address
            "backTitle": u"微博",
            "tryCount": "",
            "vk": vk,
            "submit": u"登錄"
        }
        return data

    def getContent(self, data):
        # Log in, then extract the newest post text and its timestamp
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath("//span[@class='ctt']")
        newcontent = unicode(content[2].xpath("string(.)")).replace("http://", "")
        sendtime = new_selector.xpath("//span[@class='ct']/text()")[0]
        sendtext = newcontent + sendtime
        return sendtext

    def tosave(self, text):
        # Append the post to a local file so it is not mailed twice
        f = open("weibo.txt", "a")
        f.write(text + "\n")
        f.close()

    def tocheck(self, data):
        # Return True only if this post has not been seen before
        if not os.path.exists("weibo.txt"):
            return True
        else:
            f = open("weibo.txt", "r")
            existweibo = f.readlines()
            f.close()
            if data + "\n" in existweibo:
                return False
            else:
                return True

if __name__ == "__main__":
    mailto_list = ["xxxxx@qq.com"]   # recipient address
    helper = xxoohelper()
    while True:
        source = helper.getSource()
        data = helper.getData(source)
        content = helper.getContent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, u"女神更新啦", content):
                print u"發送成功"
            else:
                print u"發送失敗"
            helper.tosave(content)
            print content
        else:
            print u"pass"
        time.sleep(30)


Reportedly, scraping the mobile version works surprisingly well.


Has anyone debugged this with Python 3 + Selenium + Firefox? Any pointers appreciated: http://blog.csdn.net/cainiao2013/article/details/77466632
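For reference, a minimal Python 3 + Selenium + Firefox sketch (assumes geckodriver is installed and on PATH; the "ctt" class is carried over from the mobile-page code above and may have changed):

# Sketch: headless Firefox visiting the mobile site and printing post text.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("-headless")          # run without opening a window
driver = webdriver.Firefox(options=options)
try:
    driver.get("http://weibo.cn/u/xxxxxxx")   # replace with the target page
    # Without a logged-in session only public content is reachable;
    # adjust the selector if the mobile markup has changed.
    for span in driver.find_elements(By.CLASS_NAME, "ctt"):
        print(span.text)
finally:
    driver.quit()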


I scraped it before; not sure whether it still works.

Scrape the mobile pages; at the time they had fewer restrictions than the desktop site.

Deploy the crawler across multiple Google App Engine nodes and run it there.
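The idea, roughly: each node exposes a small fetch service and the crawler round-robins requests through them so no single IP hits Weibo too often. A sketch with hypothetical node URLs and a hypothetical /fetch?url=... endpoint:

# Sketch: spread requests across several fetch nodes (e.g. App Engine apps).
import itertools
import time
import requests

# Hypothetical node URLs; each would run a trivial fetch/proxy service.
NODES = [
    "https://node1-example.appspot.com/fetch",
    "https://node2-example.appspot.com/fetch",
    "https://node3-example.appspot.com/fetch",
]
node_cycle = itertools.cycle(NODES)

def fetch_via_nodes(target_url):
    node = next(node_cycle)                  # round-robin over the nodes
    resp = requests.get(node, params={"url": target_url}, timeout=15)
    return resp.text

if __name__ == "__main__":
    for uid in ["111", "222", "333"]:        # placeholder uids
        html = fetch_via_nodes("http://weibo.cn/u/%s" % uid)
        print(len(html))
        time.sleep(2)                        # keep the request rate gentle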


Sina has a developer platform with dedicated API endpoints; plain crawlers will get blocked.
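For example, with an app registered on open.weibo.com and an OAuth2 access_token in hand, pulling a user's timeline through the official API looks roughly like this (check the current API docs for exact parameters):

# Sketch: call the official Weibo open API instead of scraping.
import requests

ACCESS_TOKEN = "your_access_token_here"   # placeholder
UID = "1234567890"                        # placeholder user id

resp = requests.get(
    "https://api.weibo.com/2/statuses/user_timeline.json",
    params={"access_token": ACCESS_TOKEN, "uid": UID, "count": 10},
    timeout=10,
)
for status in resp.json().get("statuses", []):
    print(status.get("text"))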

