怎麼用python爬qq空間的說說列表並保存到本地？

12-28

前幾天剛剛爬完了一百多個好友的總計5萬條說說，說一下自己的思路。
QQ空間的空間動態都是由JavaScript控制的(電腦和手機觸屏版皆是如此)，自己便選擇了Selenium作為對數據獲取的工具。
總的工具：
Python+Selenium+Firefox+MongoDB
總的過程為：
1、從QQ郵箱中獲取好友QQ號；
2、訪問好友QQ空間的說說分頁；
3、訪問並獲取說說數據；
最開始想的是從QQ空間的個人中心的好友動態中逐條獲取好友的動態，但折騰了很久發現，無論是電腦版還是觸屏版，都有一個下翻載入，再是點擊載入的Ajax，能力低下，百度谷歌了各種方法，皆無法下拉載入數據。此路只好作罷。
後來從一篇《使用Python模擬登錄QQ郵箱獲取QQ好友列表》的文章中想到，先獲取好友的QQ號，然後再把QQ號填充到QQ空間的URL中，對好友QQ空間進行訪問。
1、從QQ郵箱中獲取好友QQ號
QQ郵箱的通訊錄中有一個導出通訊錄的功能，可以選擇CSV格式，方便使用Python進行讀取操作。

打開之後是這個樣子的（一小部分）：

處理之後放入列表之中就成這樣了：

然後，進行正題：
分析QQ空間的網頁可知：
1、好友QQ空間的說說頁面URL為：http://user.qzone.qq.com/{QQ號}/311
2、說說列表的分頁亦由javascript控制：

3、正常訪問的頁面存在id為"QM_OwnerInfo_Icon"的元素；
4、說說的內容存在於另一個iframe中；
所以，爬取說說的主體部分為：
1、GET空間說說頁面的URL：

# Open the friend's "shuoshuo" (status feed) page; `qq` is the friend's QQ number.
shuoshuo_url = "http://user.qzone.qq.com/{}/311".format(qq)
driver.get(shuoshuo_url)

2、判斷頁面中是否存在登錄框，如果存在則執行登錄操作：

# Log in when the page shows the login box (element id "login_div").
# `a` records whether the login box was found.
try:
    driver.find_element_by_id("login_div")
    a = True
except Exception:
    # The lookup raised, so treat the login box as absent.
    a = False
if a:
    # The login form lives in its own iframe.
    driver.switch_to.frame("login_frame")
    # Switch the form to account/password login.
    driver.find_element_by_id("switcher_plogin").click()
    # NOTE: {QQ號} and {QQ密碼} are the article's placeholders for the
    # account number and password; replace them with real values before use.
    driver.find_element_by_id("u").clear()  # user-name box
    driver.find_element_by_id("u").send_keys({QQ號})
    driver.find_element_by_id("p").clear()  # password box
    driver.find_element_by_id("p").send_keys({QQ密碼})
    driver.find_element_by_id("login_button").click()

2、在登錄QQ空間，進入好友空間後，判斷其是否可訪問，如果可訪問，則爬取第一頁並寫入資料庫（有些拒絕訪問或是要求回答問題，在此只爬取能夠訪問到的空間說說）：

# Scrape the first page of posts when the friend's space is accessible.
# `b` records whether the element with id "QM_OwnerInfo_Icon" was found,
# which the article uses as the marker of a normally accessible space.
try:
    driver.find_element_by_id("QM_OwnerInfo_Icon")
    b = True
except Exception:
    b = False
if b:
    # The post list is rendered inside a separate iframe.
    driver.switch_to.frame("app_canvas_frame")
    content = driver.find_elements_by_css_selector(".content")
    stime = driver.find_elements_by_css_selector(".c_tx.c_tx3.goDetail")
    # Pair each post body with its timestamp element and store one document per post.
    for con, sti in zip(content, stime):
        data = {
            "qq": qq,
            "time": sti.text,
            "shuos": con.text,
        }
        print(data)
        qzone_shuos.insert_one(data)

3、有些好友說說發表的少，只有第一頁的內容，所以判斷頁面是否有第二頁的內容，如果有，繼續爬取：

# Scrape the following page when a "next page" link exists.
# `d` records whether that link was found.
# NOTE(review): the link text is reproduced as it appears in the article;
# confirm it matches the text actually rendered by the site.
try:
    next_page_link = driver.find_element_by_link_text("下一頁")
    d = True
except Exception:
    d = False
# Fix: the original tested `b` (space accessible) instead of `d`, and never
# followed the link, so it stored the first page's posts a second time.
if d:
    next_page_link.click()
    # NOTE(review): the list is reloaded by JavaScript; an explicit wait may be
    # needed here before reading the elements - confirm against the live page.
    contents = driver.find_elements_by_css_selector(".content")
    times = driver.find_elements_by_css_selector(".c_tx.c_tx3.goDetail")
    for c, t in zip(contents, times):
        datas = {
            "qq": qq,
            "time": t.text,
            "shuos": c.text,
        }
        print(datas)
        qzone_shuos.insert_one(datas)

4、使用多線程或多進程或循環或遍歷，把QQ號列表中的QQ作為參數填充到爬取的主函數中。使用多進程模塊multiprocessing的Pool類老是在第一個QQ號爬取完之後報錯，折騰不來，又有大把的時間，索性直接for循環遍歷了。
最後對爬取的數據進行了一些簡單的處理和統計：
今年七個月每月的說說發表量

今年七個月說說發表量前20

嗯，代碼挺渣的，將就著看吧= =!!!

我可以模擬登陸，但是現在用的是手機，回頭再貼代碼，看贊數，哈哈。
request.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : jerry.liangj@qq.com


from config import *

from tornado.httpclient import HTTPRequest, HTTPClient,HTTPError

import tornado.web

import tornado.gen
from time import time, localtime, strftime

import datetime

import urllib, re

import json

import base64

import rsa,tea,traceback

import os, hashlib, re, tempfile, binascii, base64

class Handler(tornado.web.RequestHandler):
    """Tornado handler that logs in to mobile QZone and fetches the feed JSON.

    The code targets Python 2 (it calls ``urllib.urlencode`` and splits the
    response body as ``str``); only syntax valid on both 2 and 3 is used.
    """

    # RSA public key (hex modulus, exponent 3) used to encrypt the MD5 digest
    # of the password in `pwdencode`.
    pubKey = rsa.PublicKey(int(
        "F20CE00BAE5361F8FA3AE9CEFA495362"
        "FF7DA1BA628F64A347F0A8C012BF0B25"
        "4A30CD92ABFFE7A6EE0DC424CB6166F8"
        "819EFA5BCCB20EDFB4AD02E412CCF579"
        "B1CA711D55B8B0B3AEB60153D5E0693A"
        "2A86F3167D7847A0CB8B00004716A909"
        "5D9BADC977CBB804DBDCBA6029A97108"
        "69A453F27DFDDF83C016D928B3CBF4C7", 16
    ), 3)

    def fromhex(self, s):
        """Convert a hex string to the corresponding byte string."""
        return bytes(bytearray.fromhex(s))

    def pwdencode(self, vcode, uin, pwd):
        """Build the encrypted ``p`` login parameter.

        :param vcode: verification code taken from the check response.
        :param uin: salt taken from the check response, written as a sequence
            of ``\\xNN`` escapes.
        :param pwd: plain-text password.
        :return: base64 text of the TEA-encrypted payload, with ``/``, ``+``
            and ``=`` replaced by ``-``, ``*`` and ``_``.
        """
        # Strip the literal "\x" prefixes so that only hex digits remain.
        salt = uin.replace(r"\x", "")
        h1 = hashlib.md5(pwd.encode()).digest()
        # TEA key: upper-case hex MD5 of (password digest + salt bytes).
        s2 = hashlib.md5(h1 + self.fromhex(salt)).hexdigest().upper()
        rsaH1 = binascii.b2a_hex(rsa.encrypt(h1, self.pubKey)).decode()
        hexVcode = binascii.b2a_hex(vcode.upper().encode()).decode()
        # Both byte lengths are written as hex, left-padded with zeros to 4 digits.
        rsaH1Len = hex(len(rsaH1) // 2)[2:].rjust(4, "0")
        vcodeLen = hex(len(hexVcode) // 2)[2:].rjust(4, "0")
        pwd1 = rsaH1Len + rsaH1 + salt + vcodeLen + hexVcode
        saltPwd = base64.b64encode(
            tea.encrypt(self.fromhex(pwd1), self.fromhex(s2))
        ).decode().replace("/", "-").replace("+", "*").replace("=", "_")
        return saltPwd

    def _browser_headers(self, cookie):
        """Return the browser-like headers sent with the login-flow requests."""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
            "Accept-Encoding": "gzip, deflate",
            "Cookie": cookie,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        }

    def qq_request(self):
        """Run the login flow and request the feed JSON.

        Steps: fetch ``init_url``, fetch ``checkurl``, submit the login
        request, follow the returned URL to obtain the skey cookie, derive
        ``g_tk`` from it, then request the feed.

        :return: a dict with ``code`` and ``content``. Any exception is
            reported as code 201 with the exception text. On success the
            initial ``{"code": 400, "content": "No Result"}`` dict is returned
            unchanged, because the visible code parses the feed but never
            stores it in the result.
        """
        retjson = {"code": 400, "content": "No Result"}
        try:
            client = HTTPClient()
            try:
                # init
                request = HTTPRequest(
                    init_url,
                    method="GET",
                    headers=self._browser_headers(""),
                    request_timeout=4
                )
                response = client.fetch(request)
                init_cookie = response.headers["Set-Cookie"]

                # check
                request = HTTPRequest(
                    checkurl,
                    method="GET",
                    headers=self._browser_headers(init_cookie),
                    request_timeout=4
                )
                response = client.fetch(request)
                check_cookie = response.headers["Set-Cookie"]
                # NOTE(review): the delimiter was garbled in this copy of the
                # source; a single quote is assumed - confirm against a live
                # check response.
                result = response.body.split("'")

                # login
                session = result[7]
                code = result[3]
                salt = result[5]
                data = {
                    "aid": "549000929",
                    "daid": "147",
                    "device": 2,
                    "fp": "loginerroralert",
                    "from_ui": 1,
                    "g": 1,
                    "h": 1,
                    "low_login_enable": 0,
                    "p": self.pwdencode(code, salt, password),
                    "pt_3rd_aid": 0,
                    "pt_randsalt": 0,
                    "pt_uistyle": 9,
                    "pt_vcode_v1": 0,
                    "pt_verifysession_v1": session,
                    "ptlang": 2052,
                    "ptredirect": 1,
                    "u": username,
                    "u1": "http://m.qzone.com/infocenter?g_f=",
                    "verifycode": code
                }
                request = HTTPRequest(
                    login_url + urllib.urlencode(data),
                    method="GET",
                    headers=self._browser_headers(init_cookie + check_cookie),
                    request_timeout=4
                )
                response = client.fetch(request)

                # get_token: pick individual cookies out of the combined
                # Set-Cookie header by fixed position.
                login_cookie = response.headers["Set-Cookie"]
                temp = login_cookie.split(";")
                pt2gguin = temp[0] + ";"
                skey = temp[7].split(",")[1] + ";"
                ptcz = temp[28].split(",")[1] + ";"
                uin = temp[4].split(",")[1] + ";"
                ptsip = temp[22].split(",")[1] + ";"
                tempcookie = pt2gguin + skey + ptcz + uin + ptsip
                # NOTE(review): same garbled delimiter as above; single quote assumed.
                getPtSkeyUrl = response.body.split("'")[5]

                # get superkey: redirects are not followed, so the redirect
                # response surfaces as an HTTPError carrying the cookies.
                request1 = HTTPRequest(
                    getPtSkeyUrl,
                    method="GET",
                    headers={
                        "Cookie": init_cookie + tempcookie + login_cookie,
                    },
                    request_timeout=8,
                    follow_redirects=False
                )
                ptskey = ""
                # Initialised so the feed request below cannot hit an unbound
                # name when no HTTPError is raised.
                skey_cookie = ""
                try:
                    response = client.fetch(request1)
                except HTTPError as e:
                    skey_cookie = e.response.headers["Set-Cookie"]
                    ptskey = skey_cookie.split(";")[13].split("=")[1]

                # g_tk: hash of the skey, starting at 5381 and masked to 31 bits.
                hash1 = 5381
                for ch in ptskey:
                    hash1 += (hash1 << 5) + ord(ch)
                super_token = hash1 & 2147483647

                # refresh_type determines the number of posts returned.
                # (Corresponding page: http://m.qzone.com/infocenter#3084772927/mine)
                jsonUrl = ("http://m.qzone.com/combo?g_tk=" + str(super_token) +
                           "&hostuin=3084772927&action=1&g_f=&refresh_type=1"
                           "&res_type=2&format=json")
                request = HTTPRequest(
                    jsonUrl,
                    method="GET",
                    headers=self._browser_headers(
                        init_cookie + login_cookie + skey_cookie),
                    request_timeout=4
                )
                response = client.fetch(request)
                ret = json.loads(response.body)
                # Not used further in the visible code; the lookup still raises
                # if the expected keys are missing.
                testjson = ret["data"]["feeds"]["vFeeds"]
                # NOTE(review): escapes restored on the assumption that the
                # pattern matches tags such as "[em]e123[/em]" - confirm.
                re_regular = re.compile(r"\[em\]e(\d+)\[/em\]")
            finally:
                # Release the blocking client's resources on every path.
                client.close()
        except Exception as e:
            retjson["code"] = 201
            retjson["content"] = str(e)
        return retjson

config.py

# Settings for the QZone login flow.
# The "&" separators between query parameters were lost in this copy of the
# source and are restored here.

# Timeout value; the visible request code hard-codes its own timeouts and
# does not reference this name.
TIME_OUT = 4
# Login page that hands out the initial cookies.
init_url = "http://ui.ptlogin2.qzone.com/cgi-bin/login?style=9&appid=549000929&daid=147&pt_no_auth=1&s_url=http%3A%2F%2Fm.qzone.com%2Finfocenter%3Fg_f%3D"
# Check endpoint; its response supplies the verification code, salt and session.
checkurl = "http://check.ptlogin2.qzone.com/check?pt_tea=1&uin=1538968615&appid=549000929&ptlang=2052"
# Login endpoint; the url-encoded parameters are appended after the "?".
login_url = "http://ptlogin2.qzone.com/login?"
# Credentials, intentionally left blank - fill in before use.
password = ""
username = ""

這是主要加密模塊
這裡藉助了別人的分析好的庫萬分感謝
否則自己當時還卡在了那三千行茫茫js裡面。。。。。
tea.py

#!/usr/bin/env python
# coding=utf-8
"""
The MIT License


Copyright (c) 2005 hoxide
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
QQ Crypt module.

Maintainer: Gerald

Last change: 2015 Apr 19

"""
import struct, ctypes

from binascii import b2a_hex, a2b_hex

from random import randint
# Names exported by a star-import of this module: only the high-level
# encrypt/decrypt pair.
__all__ = ["encrypt", "decrypt"]
def xor(a, b):
    """Return the XOR of the first 8 bytes of ``a`` and ``b`` as 8 bytes.

    Each operand is read as two big-endian unsigned 32-bit words, and the
    words are XOR-ed pairwise.
    """
    words_a = struct.unpack("!LL", a[0:8])
    words_b = struct.unpack("!LL", b[0:8])
    mixed = [wa ^ wb for wa, wb in zip(words_a, words_b)]
    return struct.pack("!LL", *mixed)
def encipher(v, k):
    """Encrypt one 64-bit block with TEA using a 128-bit key (16 rounds).

    QQ uses 16-round TEA.
    http://www.ftp.cl.cam.ac.uk/ftp/papers/djw-rmn/djw-rmn-tea.html

    :param v: data to encrypt; only the first 8 bytes are used.
    :param k: key; only the first 16 bytes are used.
    :return: the 8-byte cipher block.

    Example carried over from the original docstring:

    >>> c = encipher(b"abcdefgh", b"aaaabbbbccccdddd")
    >>> b2a_hex(c)
    b'a557272c538d3e96'
    """
    n = 16  # QQ uses 16 rounds
    delta = 0x9e3779b9
    k = struct.unpack("!LLLL", k[0:16])
    # c_uint32 keeps every update reduced modulo 2**32.
    y, z = map(ctypes.c_uint32, struct.unpack("!LL", v[0:8]))
    s = ctypes.c_uint32(0)
    for _ in range(n):
        s.value += delta
        # Explicit parentheses spell out the grouping Python already applies
        # ("+" binds tighter than "^").
        y.value += ((z.value << 4) + k[0]) ^ (z.value + s.value) ^ ((z.value >> 5) + k[1])
        z.value += ((y.value << 4) + k[2]) ^ (y.value + s.value) ^ ((y.value >> 5) + k[3])
    return struct.pack("!LL", y.value, z.value)
def encrypt(v, k):
    """Encrypt a message the way QQ does (TEA with block chaining).

    :param v: message to encrypt (bytes).
    :param k: key (bytes); the first 16 bytes are used.
    :return: the cipher text (bytes).

    The plain text is framed before encryption as::

        1 header byte (filln | 0xf8)
        filln + 2 fill bytes (a fixed 0xAD here; described as random upstream)
        the message
        7 zero bytes

    so the final length is ``filln + 10 + len(v)``, a multiple of 8.

    The frame is encrypted 8 bytes at a time:

        r = encipher(v ^ tr, key) ^ to   (*)

    ``encipher`` is QQ's TEA function, ``v`` is the 8-byte block being
    encrypted, ``tr`` is the cipher block produced in the preceding round and
    ``to`` is the value that was enciphered in the preceding round
    (v_pre ^ r_pre_pre). For the first block both are all zero.

    >>> en = encrypt(b"", b2a_hex(b"b537a06cf3bcb33206237d7149c27bc3"))
    >>> decrypt(en, b2a_hex(b"b537a06cf3bcb33206237d7149c27bc3"))
    b''
    """
    vl = len(v)
    # Same as (8 - (vl + 2)) % 8.
    filln = (6 - vl) % 8
    v = b"".join([
        bytes(bytearray([filln | 0xf8])),
        b"\xad" * (filln + 2),
        v,
        b"\0" * 7,
    ])
    tr = b"\0" * 8
    to = b"\0" * 8
    r = []
    for i in range(0, len(v), 8):
        o = xor(v[i:i + 8], tr)
        tr = xor(encipher(o, k), to)
        to = o
        r.append(tr)
    return b"".join(r)
def decrypt(v, k):
    """Decrypt a message produced by QQ's chained TEA (see ``encrypt``).

    :param v: cipher text (bytes). No length check is performed, so input
        that is not made of whole 8-byte blocks raises ``struct.error``.
    :param k: key (bytes); the first 16 bytes are used.
    :return: the plain message, or ``None`` when the decrypted data does not
        end with the expected 7 zero bytes.

    According to (*) in ``encrypt``:

        x = decipher(v[i:i+8] ^ prePlain, key) ^ preCrypt

    ``prePlain`` is the previous block as it was enciphered (plain block XOR
    previous cipher block) and ``preCrypt`` is the previous 8 bytes of cipher
    text.

    After decrypting, the padding is stripped. The number of padding bytes at
    the front is ``pos + 1`` with ``pos = (r[0] & 0x07) + 2``, and 7 zero
    bytes are removed from the end, so the result is ``r[pos+1:-7]``.

    >>> r = encrypt(b"", b2a_hex(b"b537a06cf3bcb33206237d7149c27bc3"))
    >>> decrypt(r, b2a_hex(b"b537a06cf3bcb33206237d7149c27bc3"))
    b''
    >>> r = encrypt(b"abcdefghijklimabcdefghijklmn", b2a_hex(b"b537a06cf3bcb33206237d7149c27bc3"))
    >>> decrypt(r, b2a_hex(b"b537a06cf3bcb33206237d7149c27bc3"))
    b'abcdefghijklimabcdefghijklmn'

    Test vector carried over from the original docstring (not re-verified):

    >>> import hashlib
    >>> key = hashlib.md5(hashlib.md5(b"python").digest()).digest()
    >>> data = b"8CE160B9F312AEC9AC8D8AEAB41A319EDF51FB4BB5E33820C77C48DFC53E2A48CD1C24B29490329D2285897A32E7B32E9830DC2D0695802EB1D9890A0223D0E36C35B24732CE12D06403975B0BC1280EA32B3EE98EAB858C40670C9E1A376AE6C7DCFADD4D45C1081571D2AF3D0F41B73BDC915C3AE542AF2C8B1364614861FC7272E33D90FA012620C18ABF76BE0B9EC0D24017C0C073C469B4376C7C08AA30"
    >>> data = a2b_hex(data)
    >>> b2a_hex(decrypt(data, key))  # doctest: +SKIP
    b'00553361637347436654695a354d7a51531c69f1f5dde81c4332097f0000011f4042c89732030aa4d290f9f941891ae3670bb9c21053397d05f35425c7bf80000000001f40da558a481f40000100004dc573dd2af3b28b6a13e8fa72ea138cd13aa145b0e62554fe8df4b11662a794000000000000000000000000dde81c4342c8966642c4df9142c3a4a9000a000a'
    """
    l = len(v)
    prePlain = decipher(v, k)
    # Fix: "x & 0x07 + 2" parses as "x & 9" because "+" binds tighter than
    # "&". bytearray() yields an int on both Python 2 and Python 3.
    pos = (bytearray(prePlain)[0] & 0x07) + 2
    blocks = [prePlain]
    preCrypt = v[0:8]
    for i in range(8, l, 8):
        x = xor(decipher(xor(v[i:i + 8], prePlain), k), preCrypt)
        prePlain = xor(x, preCrypt)
        preCrypt = v[i:i + 8]
        blocks.append(x)
    r = b"".join(blocks)
    # Only data carrying the 7-byte zero trailer is accepted; otherwise the
    # function falls through and returns None, as before.
    if r[-7:] == b"\0" * 7:
        return r[pos + 1:-7]

def decipher(v, k):
    """Decrypt one 64-bit block with TEA using a 128-bit key (16 rounds).

    This is the inverse of ``encipher``.

    :param v: cipher block; only the first 8 bytes are used.
    :param k: key; only the first 16 bytes are used.
    :return: the 8-byte plain block.

    >>> c = encipher(b"abcdefgh", b"aaaabbbbccccdddd")
    >>> decipher(c, b"aaaabbbbccccdddd")
    b'abcdefgh'
    """
    n = 16
    # c_uint32 keeps every update reduced modulo 2**32.
    y, z = map(ctypes.c_uint32, struct.unpack("!LL", v[0:8]))
    a, b, c, d = map(ctypes.c_uint32, struct.unpack("!LLLL", k[0:16]))
    delta = 0x9E3779B9
    # Start from delta * 16 (mod 2**32), the sum encipher ends on, and walk
    # the rounds backwards.
    s = ctypes.c_uint32(delta << 4)
    for _ in range(n):
        z.value -= ((y.value << 4) + c.value) ^ (y.value + s.value) ^ ((y.value >> 5) + d.value)
        y.value -= ((z.value << 4) + a.value) ^ (z.value + s.value) ^ ((z.value >> 5) + b.value)
        s.value -= delta
    return struct.pack("!LL", y.value, z.value)

最後給個blog
裡面分析了加密的流程不清楚的可以看一看
看完你就會發現，原來那三千行是紙老虎：他說的md5就是md5，沒有做其他事，
只不過它自己實現了所以代碼比較多
Python模擬登錄QQ空間（二）

ps：代碼基於tornado 有願意的可以轉為request 還是比較容易的替換下基本上就好了

本人有經驗，給你條明路: http://m.qzone.com

Selenium + PhantomJS 實現模擬登陸。知乎上有現成的代碼，可參考。

另外，不知道樓主能否爬取設置訪問許可權的QQ空間說說，還請賜教

我想問一下，能不能類似地去爬取一下別人給一個好友回復的所有空間消息呢(針對說說的評論和秘密的評論)

首先，你得模擬登陸到qq空間，其次你要分析如此多的ajax刷新。。等你搞定了，你會發現。。。。