NBA 1985-2017 數據分析一
數據收集:
- 爬取網站NBA賽程數據1985賽季至今所有比賽結果;
- 寫入mysql
數據概覽:
- 場均得分
- 主場優勢,勝率,主客場分差
初始數據:
主隊勝負列:主隊勝1,主隊負0
賽季 日期 主隊 主隊得分 客隊 客隊得分 主隊勝負 主隊分差 得分總和
0 1985-1986 1985-10-25 老鷹 91 子彈 100 0 -9 191
1 1985-1986 1985-10-25 公牛 116 騎士 115 1 1 231
2 1985-1986 1985-10-25 活塞 118 雄鹿 116 1 2 234
3 1985-1986 1985-10-25 勇士 105 掘金 119 0 -14 224
4 1985-1986 1985-10-25 籃網 113 凱爾特 109 1 4 222
各賽季比賽數:
98-99,11-12賽季停擺,
04年以來,每賽季總比賽在1315場左右,
其中常規賽1230場
17-18賽季至12月27日共497場
df.iloc[:,0:2].groupby(賽季).count().plot(stacked=False)
場均得分變化:
最高85-86賽季220.5分
最低98-99賽季182.5分
呈V字形,從2000年左右最低,
近年場均得分有逐漸提高的趨勢,11-12賽季相較10-11有6分的下滑
sav=df.groupby(賽季).mean().loc[:,"得分總和"]
sav.plot(stacked=False,figsize=(10,3))
主隊勝率:
主隊勝率最高在87-88賽季,68%
最低在14-15賽季,57.6%
總體逐年降低
hw=df.groupby(賽季).mean().loc[:,"主隊勝負"]
hw.plot(stacked=False,figsize=(10,3))
主客場分差:
主客場分差最高出現在85-90期間,5-6分
最低在14-15賽季 2.4分
總體呈下滑趨勢,與主隊勝率分析大體一致,主場優勢(此數據包括季後賽)不明顯
17-18至今樣本數據,分差1.96分
hdiff=df.groupby(賽季).mean().loc[:,"主隊分差"]
hdiff.plot(stacked=False,figsize=(10,3))
代碼如下:
資料庫:
建庫NBA,建表gameresult,column:g_id,season,date,host,hostscore,guest,guestscore
導入模塊:
import requests
from bs4 import BeautifulSoup
import re
from mysql import connector
資料庫類:
class MySql():
def __init__(self, host, user, password, port, db):
self.host = host
self.user = user
self.password = password
self.port = port
self.db = db
self.conn = connector.connect(host=self.host, user=self.user, passwd=self.password, port=self.port, db=self.db)
self.cursor = self.conn.cursor()
#查詢
def queryDB(self, table_name, param):
sql = select * from + table_name + " where season=%s and date=%s and guest=%s and guestscore=%s and hostscore=%s and host=%s;"
self.cursor.execute(sql,param)
def fetchRow(self):
result = self.cursor.fetchone()
return result
#插入,插入前判斷是否存在該條記錄
def insertDB(self, table_name, param):
self.queryDB(table_name, param)
count = self.fetchRow()
if count == None:
sql = "insert into " + table_name + "(season, date, guest,guestscore,hostscore,host) values(%s, %s, %s, %s, %s, %s);"
self.cursor.execute(sql,param)
def commitDB(self):
self.conn.commit()
def closeDB(self):
self.cursor.close()
self.conn.close()
爬取程序:
def search():
a1 = 1985
a2 = 1986
yl = []
while int(a1) <= 2017:
y= (%s-%s%(a1,a2))
yl.append(y)
a1,a2 = a2,str(int(a2)+1)
for season in yl:
fy = season[:4]
sy = season[-4:]
ml = (fy+-10,fy+-11,fy+-12,sy+-01,sy+-02,sy+-03,sy+-04,sy+-05,sy+-06)
for m in ml:
URL = (http://stat-nba.com/gameList_simple-%s.html%m)
if re.match(rd{4}-d{4}$,season) and int(season[-4:])-int(season[:4])==1:
req = requests.get(URL)
soup = BeautifulSoup(req.content.decode(utf-8),html.parser)
for ml in soup.find_all(div,{class:"cheight"}):
for gd in ml.find_all(font,{class:cheightdate}):
date = str(gd.string)
for gr in ml.find_all(a):
gameresult = gr.string.replace( 76人,SIXER)
guest = re.match(r^(w*?)(d*)-(d*)(w*)$,gameresult)[1]
host = re.match(r^(w*?)(d*)-(d*)(w*)$,gameresult)[4]
guestscore = re.match(r^(w*?)(d*)-(d*)(w*)$,gameresult)[2] or 0
hostscore = re.match(r^(w*?)(d*)-(d*)(w*)$,gameresult)[3] or 0
mysql = MySql(127.0.0.1,root,,3306,nba)
param = (season, date, guest,guestscore,hostscore,host)
mysql.insertDB("gameresult",param)
mysql.commitDB()
mysql.closeDB()
print(date,gameresult)
if __name__ == __main__:
search()
pandas讀取數據:
%matplotlib inline
from mysql import connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
添加列:
def add_result(df):
hsl=[x for x in df.主隊得分]
gsl=[y for y in df.客隊得分]
rl=[]
for x,y in zip(hsl,gsl):
if x>y:
rl.append(1)
else:
rl.append(0)
df[主隊勝負]=rl
# add_result(r2017)
# r2017.head()
def add_diff(df):
hsl=[x for x in df.主隊得分]
gsl=[y for y in df.客隊得分]
rl=[]
for x,y in zip(hsl,gsl):
rl.append(x-y)
df[主隊分差]=rl
def add_sum(df):
hsl=[x for x in df.主隊得分]
gsl=[y for y in df.客隊得分]
rl=[]
for x,y in zip(hsl,gsl):
rl.append(x+y)
df[得分總和]=rl
導出數據:
conn = connector.connect(user = root,password=,database = nba)
cursor = conn.cursor()
cursor.execute("select season,date,host,hostscore,guest,guestscore from gameresult where guestscore>0 order by date")
value = cursor.fetchall()
# print(value)
df = pd.DataFrame(value,columns=(賽季,日期,主隊,主隊得分,客隊,客隊得分))
add_result(df)
add_diff(df)
add_sum(df)
推薦閱讀: