IMDB Movie:Python 數據分析報告
附代碼:
# Exploratory analysis of the TMDb 5000 movies / credits CSV files.
#
# The first half converts the TMDb export into the column layout used by the
# older IMDB dataset; the second half (guarded by ``__main__``) draws a series
# of charts from the converted frame.
import json
from collections import Counter

import numpy as np
import pandas as pd


def load_tmdb_movies(path):
    """Read the TMDb movies CSV at *path* and decode its JSON columns.

    ``release_date`` is parsed into ``datetime.date`` values and each
    JSON-encoded column is replaced by the decoded Python object.
    """
    df = pd.read_csv(path)
    df['release_date'] = pd.to_datetime(df['release_date']).dt.date
    json_columns = [
        'genres',
        'keywords',
        'production_countries',
        'production_companies',
        'spoken_languages',
    ]
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df


def load_tmdb_credits(path):
    """Read the TMDb credits CSV at *path* and decode ``cast`` and ``crew``."""
    df = pd.read_csv(path)
    for column in ['cast', 'crew']:
        df[column] = df[column].apply(json.loads)
    return df


# Columns of the IMDB version that have no counterpart in the TMDb data.
LOST_COLUMNS = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'aspect_ratio',
    'cast_total_facebook_likes',
    'color',
    'content_rating',
    'director_facebook_likes',
    'facenumber_in_poster',
    'movie_facebook_likes',
    'movie_imdb_link',
    'num_critic_for_reviews',
    'num_user_for_reviews',
]

# Columns in TMDb that had direct equivalents in the IMDB version.
# These columns can be used with old kernels just by changing the names.
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = {
    'budget': 'budget',
    'genres': 'genres',
    'revenue': 'gross',
    'title': 'movie_title',
    'runtime': 'duration',
    # It's possible that spoken_languages would be a better match.
    'original_language': 'language',
    'keywords': 'plot_keywords',
    'vote_count': 'num_voted_users',
}

IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}


def safe_access(container, index_values):
    """Follow *index_values* into *container*, returning NaN when a step fails.

    Each element of *index_values* is applied in turn as ``result[idx]``.
    A missing list position (``IndexError``) or a missing dict key
    (``KeyError``) yields ``np.nan`` instead of raising.
    """
    result = container
    try:
        for idx in index_values:
            result = result[idx]
    # A tuple is required here: ``except IndexError or KeyError`` evaluates
    # the ``or`` first and therefore only ever catches IndexError.
    except (IndexError, KeyError):
        return np.nan
    return result


def get_director(crew_data):
    """Return the name of the first crew entry whose job is 'Director', or NaN."""
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return safe_access(directors, [0])


def pipe_flatten_names(keywords):
    """Join the ``name`` field of every entry in *keywords* with ``|``."""
    return '|'.join([x['name'] for x in keywords])


def convert_to_original_format(movies, credits):
    """Reshape TMDb data to resemble the original IMDB dataset layout.

    Converts TMDb data to make it as compatible as possible with kernels
    built on the original version of the data.  *movies* is left untouched;
    a renamed copy with the derived columns added is returned.  Values taken
    from *credits* are assigned by index, so both frames are expected to
    share the same row index.
    """
    tmdb_movies = movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).dt.year
    # I'm assuming that the first production country is equivalent, but have
    # not been able to validate this.
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(
        lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(
        lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    # actor_1_name .. actor_5_name come from the first five cast entries.
    for rank in range(1, 6):
        tmdb_movies['actor_%d_name' % rank] = credits['cast'].apply(
            lambda x, position=rank - 1: safe_access(x, [position, 'name']))
    for column in ['genres', 'plot_keywords', 'production_companies']:
        tmdb_movies[column] = tmdb_movies[column].apply(pipe_flatten_names)
    return tmdb_movies


if __name__ == "__main__":
    # Plotting libraries are only needed for the report itself; importing them
    # here keeps the loader/converter helpers importable without them.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from wordcloud import WordCloud  # noqa: F401  (imported by the report, unused below)

    movies = load_tmdb_movies(r'C:\aqiu\tmdb_5000_movies.csv')
    credits = load_tmdb_credits(r'C:\aqiu\tmdb_5000_credits.csv')
    original_format = convert_to_original_format(movies, credits)
    original_format = original_format.drop(columns='homepage')

    # Fill in missing values; 'U' stands in for unknown text fields.
    original_format['language'] = original_format['language'].fillna('English')
    original_format['overview'] = original_format['overview'].fillna('U')
    original_format['duration'] = original_format['duration'].fillna(
        original_format['duration'].mean())
    original_format['tagline'] = original_format['tagline'].fillna('U')
    # Cast to int once no NaN is left, so years can be used with range() and
    # as integer index labels further down.
    original_format['title_year'] = (
        original_format['title_year'].fillna(2015.0).astype(int))
    original_format['country'] = original_format['country'].fillna(
        'United States of America')
    for column in ['director_name', 'actor_1_name', 'actor_2_name',
                   'actor_3_name', 'actor_4_name', 'actor_5_name']:
        original_format[column] = original_format[column].fillna('U')

    # Every chart below gets its own figure/axes: outside a notebook, a Series
    # plot without an explicit ``ax`` draws onto whatever axes is current.

    # Visualisation: rating versus box-office revenue.
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.regplot(x='vote_average', y='gross', data=original_format, ax=ax)
    ax.set_xlabel('電影評分', fontsize=15)
    ax.set_ylabel('電影票房', fontsize=15)
    ax.set_title('電影評分與票房', fontsize=22)
    ax.grid(True)

    # Which genres have the most movies?
    genre_lists = original_format['genres'].str.split('|')
    all_genres = [name for names in genre_lists for name in names if name]
    top_genres = (pd.Series(all_genres).value_counts()
                  .iloc[:10].sort_values(ascending=True))
    fig, ax = plt.subplots(figsize=(12, 10))
    top_genres.plot.barh(width=0.9, ax=ax)
    for position, count in enumerate(top_genres.values):
        ax.text(.8, position, count, fontsize=12, color='white', weight='bold')
    # Highlight the largest bar (last after the ascending sort).
    ax.patches[-1].set_facecolor('r')
    ax.set_title('Top Genres')

    # Which years released the most movies?  Years below 1% are pooled.
    genres_cat1 = original_format['title_year'].value_counts()
    genres_catl = genres_cat1 / genres_cat1.sum()
    others = 0.01
    genres_catl_ = pd.concat([
        genres_catl[genres_catl >= others],
        pd.Series({'Other': genres_catl[genres_catl < others].sum()}),
    ])
    fig, ax = plt.subplots(figsize=(9, 9))
    genres_catl_.plot(kind='pie', label='', startangle=10, shadow=False,
                      autopct="%1.1f%%", ax=ax)
    ax.set_title('上映電影數量最多的年份', fontsize=22)

    # Number of movies released over time.
    # (Computed before plotting; the original used it before assignment.)
    df_movies_year = original_format.groupby(['title_year'])['movie_title'].count()
    fig, ax = plt.subplots(figsize=(12, 7))
    df_movies_year.plot(marker='.', color='g', ax=ax)
    ax.set_title('隨著時間推移 電影數量的變化', fontsize=22)
    ax.set_xlabel('年份', fontsize=15)
    ax.set_ylabel('數量', fontsize=15)
    ax.grid(True)

    # Total box-office revenue over time.
    movie_year_gross = original_format.groupby(['title_year'])['gross'].sum()
    fig, ax = plt.subplots(figsize=(12, 7))
    movie_year_gross.plot(marker='.', ax=ax)
    ax.set_title('隨著時間推移 電影收入的變化', fontsize=22)
    ax.set_xlabel('年份', fontsize=15)
    ax.set_ylabel('收入', fontsize=15)
    ax.grid(True)

    # How genres changed over time.
    # Distinct genre names in first-seen order; movies without any genre
    # produce an empty string, which is skipped.
    genre_names = list(dict.fromkeys(all_genres))

    # One 0/1 indicator column per genre, ordered by release year.
    movie_genres = original_format.loc[:, ['genres', 'title_year']]
    df_reduced = pd.DataFrame()
    df_reduced['title-year'] = movie_genres['title_year']
    for genre in genre_names:
        # regex=False: genre names are literal text, not patterns.
        df_reduced[genre] = (
            movie_genres['genres'].str.contains(genre, regex=False).astype(int))
    df_reduced = df_reduced.sort_values(by='title-year', ascending=True)

    # Year x genre table of movie counts, with every year in range present.
    min_year = int(original_format['title_year'].min())
    max_year = int(original_format['title_year'].max())
    genre_df = pd.DataFrame(0, index=range(min_year, max_year + 1),
                            columns=genre_names)
    year_genre_counts = Counter(
        (year, name)
        for year, names in zip(original_format['title_year'], genre_lists)
        for name in names
        if name
    )
    for (year, name), count in year_genre_counts.items():
        genre_df.at[year, name] = count
    genre_df1 = genre_df.loc[1960:]
    fig, ax = plt.subplots(figsize=(12, 8))
    genre_df1.plot(marker='.', ax=ax)
    ax.grid(True)

    # Universal Pictures compared with Paramount Pictures.
    company_cells = original_format['production_companies']
    company_cells = company_cells[company_cells != ''].str.split('|')
    company_counts = pd.Series(
        [name for names in company_cells for name in names]).value_counts()
    # Select the two studios by name rather than by a label slice, which only
    # works while they happen to be adjacent in the sorted counts.
    studio_counts = (
        company_counts
        .reindex(['Universal Pictures', 'Paramount Pictures'], fill_value=0)
        .sort_values(ascending=True))
    fig, ax = plt.subplots(figsize=(5, 7))
    studio_counts.plot.barh(width=0.2, ax=ax)
    ax.grid(True)
    ax.set_xlabel('電影數量')
    ax.set_title('兩家電影公司的電影數量對比')

    plt.show()
推薦閱讀:
※人民日報中央廚房獲評「2017年大數據優秀應用案例」
※BOSS直聘行業信息爬取與分析(二)
※Matplotlib中關於坐標軸的控制
※數據分析神器Tableau——讓你的數據會說話
※大眾點評數據分析