Python数据分析案例12——网飞影视剧数据分析及其可视化

本文介绍：对网飞（Netflix）影视剧数据中电影和电视剧的占比、发行年份、国家、影视剧类型、评级（rating）、简介关键词等进行一定程度的描述性统计及可视化，用到柱状图、饼图、热力图、词云图等。

Netflix 是最受欢迎的媒体和视频流平台之一。其平台上有超过 8000 部电影或电视节目；截至 2021 年年中，他们在全球拥有超过 2 亿订阅者。

博主平时美剧也看得比较多，像《怪奇物语》、《性爱自修室》等高分美剧都出自网飞。

对于网飞的影视剧，我们可以分析电影和电视剧的占比、发行年份、国家、影视剧类型、评级（rating）、简介关键词等，进行一定程度的描述性统计及可视化，从而得到哪些类型的影视剧更受欢迎、哪些国家发行的影视剧更多等结论。

此表格数据集来源于 Kaggle：Netflix Movies and TV Shows。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Global matplotlib settings: use the SimHei font so Chinese text is displayed,
# and make the minus sign display correctly.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

# Load the dataset, discard columns that are entirely empty, and index the
# rows by show_id.
# NOTE(review): 'ANSI' is presumably only resolvable as a codec name on
# Windows — confirm the file's real encoding before running elsewhere.
raw = pd.read_csv('netflix_titles.csv', encoding='ANSI')
raw = raw.dropna(how='all', axis=1)
df = raw.set_index('show_id')
df.head()  # bare expression: its result is only displayed in an interactive session

# Let pandas infer better dtypes for object columns, then print the frame's
# shape and a column summary.
df = df.infer_objects()
print(df.shape)
df.info()

# Inspect missing values by drawing missingno's matrix plot of the frame
import missingno as msno
msno.matrix(df)

# Fill the gaps in the descriptive columns before discarding incomplete rows:
# a missing country becomes the most frequent country, and a missing cast or
# director becomes the 'No Data' placeholder.
df['country'] = df['country'].fillna(df['country'].mode()[0])
# Assign the filled column back rather than calling fillna(inplace=True) on
# the selected column: that chained in-place form is deprecated and does not
# modify df once pandas Copy-on-Write is active.
df['cast'] = df['cast'].fillna('No Data')
df['director'] = df['director'].fillna('No Data')
df.dropna(inplace=True)

df.drop_duplicates(inplace=True)

# Strip surrounding whitespace first so that every date string has the same
# layout when it is parsed.
df["date_added"] = pd.to_datetime(df['date_added'].str.strip())
df['year_added'] = df['date_added'].dt.year
df['month_name_added'] = df['date_added'].dt.month_name()
df['release_year'] = df['release_year'].astype('int')

df.info()

# Pie chart of how many titles fall under each value of the `type` column.
p1 = df['type'].value_counts()
plt.figure(figsize=(2, 2), dpi=180)
plt.pie(
    p1,
    labels=p1.index,
    autopct="%1.3f%%",
    shadow=True,                   # draw a shadow under the pie
    explode=(0.2, 0),              # offset of each wedge from the centre
    colors=['royalblue', 'pink'],
)
plt.title("网飞影视剧中电影和电视剧的各自占比")
plt.show()

import squarify
# Treemap of the 15 most frequent values of the `country` column.
p2 = df.country.value_counts()[:15]
fig = plt.figure(figsize=(8, 4), dpi=256)
ax = fig.add_subplot(111)
plot = squarify.plot(
    sizes=p2,             # rectangle areas
    label=p2.index,       # rectangle labels
    value=p2,             # numeric value printed on each rectangle
    alpha=0.8,            # transparency
    ax=ax,                # draw on the axes created above, not on the implicit current one
    edgecolor='white',    # rectangle border colour
    linewidth=0.1,        # rectangle border width
)
# Title and its font size
ax.set_title('网飞影视剧数量发行量排名前15的国家', fontsize=22)
# Hide the axes
ax.axis('off')
# Turn off the top and right ticks. tick_params expects booleans here; the
# string 'off' is truthy and would not disable anything.
ax.tick_params(top=False, right=False)
# Show the figure
plt.show()

def check0(txt):
    """Return whether *txt* is among the first ten labels of ``p2.index``.

    ``p2`` is read from module scope, so it must be defined before this
    function is called.
    """
    return txt in p2.index[:10]
# Boolean mask of the rows whose country passes check0.
df_bool = df.country.astype('str').apply(check0)

# For the selected rows: share of each type within every country (each country
# sums to 1), one row per country, ordered by the 'TV Show' share.
selected = df[df_bool]
p3 = pd.crosstab(selected['type'], selected['country'], normalize='columns').T
p3 = p3.sort_values(by='TV Show')
m = np.arange(len(p3))

# Stacked bars: the second column is drawn on top of the first.
shared_bar_style = dict(width=0.3, alpha=0.5)
plt.figure(figsize=(8, 4), dpi=256)
plt.bar(x=m, height=p3.iloc[:, 0], label=p3.columns[0],
        hatch='.', color='orange', **shared_bar_style)
plt.bar(x=m, height=p3.iloc[:, 1], label=p3.columns[1], bottom=p3.iloc[:, 0],
        hatch='*', color='lime', **shared_bar_style)
plt.xticks(m, p3.index, fontsize=10, rotation=30)
plt.legend()
plt.ylabel('频率')
plt.title("网飞影视剧发行量前10的国家电影和电视剧数量对比")
plt.show()

# Bar chart of how many titles carry each value of the `rating` column.
p4 = df['rating'].value_counts()
plt.figure(figsize=(6, 3), dpi=256)
rating_ax = sns.barplot(x=p4.index, y=p4)
rating_ax.set_ylabel('数量')
rating_ax.set_xlabel('评价')
plt.xticks(fontsize=10, rotation=45)
rating_ax.set_title("网飞所有影视剧不同评级数量对比")
plt.show()

# Count titles per (type, rating) pair, ordering ratings by their 'Movie'
# count (largest first), and flatten the table into long form with the
# columns type / rating / number.
rating_by_type = pd.crosstab(df.type, df.rating).T.sort_values(by='Movie', ascending=False)
df_bar = rating_by_type.unstack().reset_index().rename(columns={0: 'number'})

# Grouped bar chart: one bar per type for every rating.
plt.subplots(figsize=(10, 4), dpi=128)
sns.barplot(data=df_bar, x='rating', y='number', hue='type', palette="copper")
# Show this figure on its own, like every other figure in the script.
plt.show()

# Rating counts per country for the rows selected by df_bool, with ratings as
# rows and countries as columns; missing combinations count as 0.
rating_counts = df[df_bool].groupby('country')['rating'].value_counts().unstack()
df_heatmap = rating_counts.sort_index().fillna(0).astype(int).T
# Rescale every column by its own total so each country's ratings sum to 1.
df_heatmap = df_heatmap / df_heatmap.sum()

plt.subplots(figsize=(8, 6), dpi=256)
corr = sns.heatmap(
    df_heatmap,
    annot=True,
    square=True,
    annot_kws={'size': 6, 'weight': 'bold', 'color': 'royalblue'},
    fmt='.2f',
    cmap='cubehelix_r',
)
plt.title('不同发行国家的网飞影视剧评级对比')
plt.show()

# Filled line chart: for every value of `type`, the number of titles per
# `year_added`.
plt.figure(figsize=(8, 3.5), dpi=128)
colors = ['tomato', 'orange', 'royalblue', 'lime', 'pink']
for i, mtv in enumerate(df['type'].value_counts().index):
    mtv_rel = df.loc[df['type'] == mtv, 'year_added'].value_counts().sort_index()
    plt.plot(mtv_rel.index, mtv_rel, color=colors[i], label=mtv)
    plt.fill_between(mtv_rel.index, 0, mtv_rel, color=colors[i], alpha=0.8)
# Build the legend once, after every series has been drawn, instead of
# rebuilding it on each loop iteration.
plt.legend()
plt.ylabel('网飞发行影视剧数量')
plt.xlabel('年份')
plt.title('网飞在不同年份上映影视剧数量')
plt.show()

# Pie chart of how many titles fall in each `month_name_added`.
plt.figure(figsize=(5, 5), dpi=128)
colors = ['tomato', 'orange', 'royalblue', 'lime', 'pink', 'brown']

p5 = df.month_name_added.value_counts()
# Offsets of the wedges from the centre, largest wedge first. pie() requires
# exactly one offset per wedge, so trim the tuple when fewer than 12 months
# are present in the data.
wedge_offsets = (0.2, 0.1, 0.08, 0.06, 0.04, 0.02, 0, 0, 0, 0, 0, 0)[:len(p5)]
plt.pie(p5, labels=p5.index, autopct="%1.3f%%", shadow=True,
        explode=wedge_offsets, colors=colors)
plt.title('网飞影视剧上映月份分析')
plt.show()

# Box plot, per type, of `age` = year_added - release_year.
age_years = df['year_added'] - df['release_year']
df_age = df[['type']].assign(age=age_years)
plt.figure(figsize=(3, 4), dpi=128)
sns.boxplot(data=df_age, x='type', y='age', width=0.8, orient="v")
plt.show()

# Count individual genres: `listed_in` holds a comma-separated list per title.
# Strip each piece after splitting — otherwise 'X' and ' X' (a piece that
# followed a comma and a space) are counted as two different genres.
genre_labels = df['listed_in'].str.split(',').explode().str.strip().rename('kind')
p6 = genre_labels.value_counts()[:15]

# Horizontal bar chart of the 15 most frequent genres.
plt.figure(figsize=(10, 4), dpi=128)
sns.barplot(y=p6.index, x=p6, orient="h")
plt.xlabel('影片数量')
plt.ylabel('影视剧类型')
plt.xticks(fontsize=10, rotation=45)
plt.title("网飞不同影视剧类型数量对比")
plt.show()

# Genre counts for the rows whose country is exactly 'United States'.
# `listed_in` is a comma-separated list; each piece is stripped so that 'X'
# and ' X' are counted as the same genre.
us_rows = df[df['country'] == 'United States']
us_genres = us_rows['listed_in'].str.split(',').explode().str.strip().rename('kind')
p7 = us_genres.value_counts()[:12]

plt.figure(figsize=(5, 5), dpi=128)
# pie() requires exactly one offset per wedge, so trim the tuple when fewer
# than 12 genres are present.
wedge_offsets = (0.15, 0.1, 0.08, 0.06, 0.04, 0.02, 0, 0, 0, 0, 0, 0)[:len(p7)]
plt.pie(p7, labels=p7.index, autopct="%1.2f%%", shadow=True, explode=wedge_offsets,
        colors=['c', 'b', 'g', 'tomato', 'm', 'y', 'lime', 'w', 'orange', 'pink', 'grey', 'tan'])
plt.title('在美国制作发行的网飞影视剧类型数量对比')
plt.show()

def _top_names(column, n=10):
    """Return the *n* most frequent individual names in a comma-separated column.

    Each piece is stripped after splitting so that 'X' and ' X' count as the
    same person, and the 'No Data' placeholder is removed explicitly rather
    than assumed to be the most frequent entry.
    """
    names = column.str.split(',').explode().str.strip()
    names = names[names != 'No Data']
    return names.value_counts().head(n)


p8 = _top_names(df['director'])
p9 = _top_names(df['cast'])

# Two side-by-side horizontal bar charts: directors on the left, actors on
# the right.
_, (ax_directors, ax_actors) = plt.subplots(1, 2, figsize=(12, 5), dpi=128)

sns.barplot(y=p8.index, x=p8, orient="h", ax=ax_directors)
ax_directors.set_ylabel('导演姓名')
ax_directors.set_xlabel('导演影视剧的数量', fontsize=14)
ax_directors.set_title("(a)网飞影视剧导演数量前十的导演")

sns.barplot(y=p9.index, x=p9, orient="h", ax=ax_actors)
ax_actors.set_ylabel('演员名字')
ax_actors.set_xlabel('出演影视剧的数量', fontsize=14)
ax_actors.set_title("(b)网飞影视剧出演数量前十的演员")

plt.tight_layout()
plt.show()

from wordcloud import WordCloud
import random
from PIL import Image
import matplotlib
# Load the mask image for the word cloud; the context manager closes the file
# once the pixel data has been copied into an array.
with Image.open('wf.png') as mask_image:
    mask = np.array(mask_image)

# Custom colour map based on Netflix palette
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#221f1f', '#b20710'])
# Join the titles with spaces and delete the characters , [ ] ' . from the
# text. Joining the strings directly (instead of going through the list's
# repr) keeps repr quoting and backslash escapes out of the text.
text = ' '.join(df['title'].astype(str)).translate(str.maketrans('', '', ",[]'."))
wordcloud = WordCloud(background_color='white', width=500, height=200, colormap=cmap,
                      max_words=150, mask=mask).generate(text)
# NOTE(review): dpi=1028 yields a very large canvas; possibly 1024 was meant — confirm.
plt.figure(figsize=(9, 5), dpi=1028)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# Word cloud of the `description` column. Join the descriptions with spaces
# and delete the characters , [ ] ' . from the text. Joining the strings
# directly (instead of going through the list's repr) keeps repr quoting and
# backslash escapes out of the text.
text2 = ' '.join(df['description'].astype(str)).translate(str.maketrans('', '', ",[]'."))
wordcloud = WordCloud(background_color='white', width=500, height=200, colormap='coolwarm',
                      max_words=30).generate(text2)
plt.figure(figsize=(8, 4), dpi=512)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()