Scraping Sina Weibo with Python, looking for advice!

I'm scraping Sina Weibo with Python and keep getting blocked. I'm already using proxies, with 10 accounts and 10 proxies, but crawling is still very slow. Does anyone have a better approach? Thanks!


http://github.com/zhu327/rss Since you're also using Python, just read the code directly.

Scrape this instead: http://service.weibo.com/widget/widget_blog.php?uid={uid} and substitute the uid. No login is required and it doesn't get blocked.
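Roughly like this (an untested sketch: the wgtCell_txt class name is only a guess at the widget markup, so adjust the XPath if it differs):

# -*- coding: utf-8 -*-
# Sketch: fetch a user's public posts from the Weibo widget page, no login.
import requests
from lxml import etree

def fetch_widget_posts(uid):
    url = "http://service.weibo.com/widget/widget_blog.php?uid=%s" % uid
    html = requests.get(url, timeout=10).content
    selector = etree.HTML(html)
    # The class name below is a guess; check the widget HTML and adjust.
    posts = selector.xpath("//div[contains(@class, 'wgtCell_txt')]")
    return [p.xpath("string(.)").strip() for p in posts]

if __name__ == "__main__":
    for text in fetch_widget_posts("1234567890"):  # replace with a real uid
        print(text)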


Scrape the mobile site instead:
http://weibo.cn
You can refer to the code below, which comes from 极客学院 (Jikexueyuan); will take it down if it infringes.

#-*-coding:utf8-*-

import smtplib
from email.mime.text import MIMEText
import requests
from lxml import etree
import os
import time
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class mailhelper(object):
    """
    This class sends the notification e-mail.
    """
    def __init__(self):
        self.mail_host = "smtp.xxxx.com"   # SMTP server
        self.mail_user = "xxxx"            # user name
        self.mail_pass = "xxxx"            # password
        self.mail_postfix = "xxxx.com"     # sender address domain

    def send_mail(self, to_list, sub, content):
        me = "xxoohelper" + "<" + self.mail_user + "@" + self.mail_postfix + ">"
        msg = MIMEText(content, _subtype="plain", _charset="utf-8")
        msg["Subject"] = sub
        msg["From"] = me
        msg["To"] = ";".join(to_list)
        try:
            server = smtplib.SMTP()
            server.connect(self.mail_host)
            server.login(self.mail_user, self.mail_pass)
            server.sendmail(me, to_list, msg.as_string())
            server.close()
            return True
        except Exception as e:
            print str(e)
            return False

class xxoohelper(object):
    """
    This class scrapes the latest post from a Weibo page.
    """
    def __init__(self):
        self.url = "http://weibo.cn/u/xxxxxxx"   # the Weibo page to scrape
        self.url_login = "https://login.weibo.cn/login/"
        self.new_url = self.url_login

    def getSource(self):
        # Fetch the page; when not logged in this returns the login form
        html = requests.get(self.url).content
        return html

    def getData(self, html):
        # Pull the dynamic field names out of the login form
        selector = etree.HTML(html)
        password = selector.xpath("//input[@type='password']/@name")[0]
        vk = selector.xpath("//input[@name='vk']/@value")[0]
        action = selector.xpath("//form[@method='post']/@action")[0]
        self.new_url = self.url_login + action
        data = {
            "mobile": "xxxxx@xxx.com",
            password: "xxxxxx",
            "remember": "on",
            "backURL": "http://weibo.cn/u/xxxxxx",  # change to the Weibo page address
            "backTitle": u"微博",
            "tryCount": "",
            "vk": vk,
            "submit": u"登錄"
        }
        return data

    def getContent(self, data):
        # Log in, then extract the newest post text and its timestamp
        newhtml = requests.post(self.new_url, data=data).content
        new_selector = etree.HTML(newhtml)
        content = new_selector.xpath("//span[@class='ctt']")
        newcontent = unicode(content[2].xpath("string(.)")).replace("http://", "")
        sendtime = new_selector.xpath("//span[@class='ct']/text()")[0]
        sendtext = newcontent + sendtime
        return sendtext

    def tosave(self, text):
        # Append the post to a local file so it is not mailed twice
        f = open("weibo.txt", "a")
        f.write(text + "\n")
        f.close()

    def tocheck(self, data):
        # Return True only if this post has not been seen before
        if not os.path.exists("weibo.txt"):
            return True
        else:
            f = open("weibo.txt", "r")
            existweibo = f.readlines()
            f.close()
            if data + "\n" in existweibo:
                return False
            else:
                return True

if __name__ == "__main__":
    mailto_list = ["xxxxx@qq.com"]   # recipient address
    helper = xxoohelper()
    while True:
        source = helper.getSource()
        data = helper.getData(source)
        content = helper.getContent(data)
        if helper.tocheck(content):
            if mailhelper().send_mail(mailto_list, u"女神更新啦", content):
                print u"發送成功"
            else:
                print u"發送失敗"
            helper.tosave(content)
            print content
        else:
            print u"pass"
        time.sleep(30)


Reportedly, scraping the mobile version works surprisingly well.


Has anyone debugged this with Python 3 + Selenium + Firefox? Any pointers appreciated: http://blog.csdn.net/cainiao2013/article/details/77466632
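For reference, a minimal Python 3 + Selenium + Firefox sketch (assumes geckodriver is installed and on PATH; the "ctt" class is carried over from the mobile-page code above and may have changed):

# Sketch: headless Firefox visiting the mobile site and printing post text.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("-headless")          # run without opening a window
driver = webdriver.Firefox(options=options)
try:
    driver.get("http://weibo.cn/u/xxxxxxx")   # replace with the target page
    # Without a logged-in session only public content is reachable;
    # adjust the selector if the mobile markup has changed.
    for span in driver.find_elements(By.CLASS_NAME, "ctt"):
        print(span.text)
finally:
    driver.quit()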


I scraped it before; not sure whether it still works.

Scrape the mobile pages; at the time they had fewer restrictions than the desktop site.

Deploy the crawler across multiple Google App Engine nodes and run it there.
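The idea, roughly: each node exposes a small fetch service and the crawler round-robins requests through them so no single IP hits Weibo too often. A sketch with hypothetical node URLs and a hypothetical /fetch?url=... endpoint:

# Sketch: spread requests across several fetch nodes (e.g. App Engine apps).
import itertools
import time
import requests

# Hypothetical node URLs; each would run a trivial fetch/proxy service.
NODES = [
    "https://node1-example.appspot.com/fetch",
    "https://node2-example.appspot.com/fetch",
    "https://node3-example.appspot.com/fetch",
]
node_cycle = itertools.cycle(NODES)

def fetch_via_nodes(target_url):
    node = next(node_cycle)                  # round-robin over the nodes
    resp = requests.get(node, params={"url": target_url}, timeout=15)
    return resp.text

if __name__ == "__main__":
    for uid in ["111", "222", "333"]:        # placeholder uids
        html = fetch_via_nodes("http://weibo.cn/u/%s" % uid)
        print(len(html))
        time.sleep(2)                        # keep the request rate gentle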


Sina has a developer platform with dedicated API endpoints; plain crawlers will get blocked.
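For example, with an app registered on open.weibo.com and an OAuth2 access_token in hand, pulling a user's timeline through the official API looks roughly like this (check the current API docs for exact parameters):

# Sketch: call the official Weibo open API instead of scraping.
import requests

ACCESS_TOKEN = "your_access_token_here"   # placeholder
UID = "1234567890"                        # placeholder user id

resp = requests.get(
    "https://api.weibo.com/2/statuses/user_timeline.json",
    params={"access_token": ACCESS_TOKEN, "uid": UID, "count": 10},
    timeout=10,
)
for status in resp.json().get("statuses", []):
    print(status.get("text"))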

