Python 之 Pandas merge() 函数、set_index() 函数、drop_duplicates() 函数和 tolist() 函数

本文介绍: merge() 函数、set_index() 函数、drop_duplicates() 函数、tolist() 函数以及视频数据分析案例。

import numpy as np
import pandas as pd

pd.merge(left,right,how: str = 'inner',on=None,left_on=None,right_on=None,left_index: bool = False,
right_index: bool = False,sort: bool = False,suffixes=('_x', '_y'),copy: bool = True,indicator: bool = False,validate=None,)

在这里插入图片描述

# Left table: one row per user, keyed by "userid".
df_1 = pd.DataFrame(
    {
        "userid": list("abcd"),
        "age": [23, 46, 32, 19],
    }
)
df_1
#   userid  age
# 0      a   23
# 1      b   46
# 2      c   32
# 3      d   19

# Right table: payments for a subset of those users.
df_2 = pd.DataFrame(
    {
        "userid": list("ac"),
        "payment": [2000, 3500],
    }
)
df_2
#   userid  payment
# 0      a     2000
# 1      c     3500

# Method form. `how` defaults to 'inner', so only userids found in both
# frames ('a' and 'c') appear in the result.
df_1.merge(df_2, on="userid")
#   userid  age  payment
# 0      a   23     2000
# 1      c   32     3500

# Function form of the same join; the result is identical.
pd.merge(df_1, df_2, on="userid")
#   userid  age  payment
# 0      a   23     2000
# 1      c   32     3500

在这里插入图片描述

# Left table: each userid appears exactly once.
df_1 = pd.DataFrame(
    {
        "userid": list("abcd"),
        "age": [23, 46, 32, 19],
    }
)
# Right table: userid 'a' appears twice (two payments).
df_2 = pd.DataFrame(
    {
        "userid": list("acad"),
        "payment": [2000, 3500, 500, 1000],
    }
)
# One-to-many join: the single 'a' row on the left is repeated once for
# every matching 'a' row on the right.
pd.merge(df_1, df_2, on="userid")
#   userid  age  payment
# 0      a   23     2000
# 1      a   23      500
# 2      c   32     3500
# 3      d   19     1000

在这里插入图片描述

# Left table: users a-d.
df_1 = pd.DataFrame(
    {
        "userid": list("abcd"),
        "age": [23, 46, 32, 19],
    }
)
# Right table: 'e' has a payment but no row in df_1.
df_2 = pd.DataFrame(
    {
        "userid": list("ace"),
        "payment": [2000, 3500, 600],
    }
)

# Left join: every row of df_1 is kept; users without a payment get NaN.
pd.merge(df_1, df_2, how="left", on="userid")
#   userid  age  payment
# 0      a   23   2000.0
# 1      b   46      NaN
# 2      c   32   3500.0
# 3      d   19      NaN

# Right join: every row of df_2 is kept; 'e' has no age, so age is NaN.
pd.merge(df_1, df_2, how="right", on="userid")
#   userid   age  payment
# 0      a  23.0     2000
# 1      c  32.0     3500
# 2      e   NaN      600

# Outer join: union of the keys from both frames.
pd.merge(df_1, df_2, how="outer", on="userid")
#   userid   age  payment
# 0      a  23.0   2000.0
# 1      b  46.0      NaN
# 2      c  32.0   3500.0
# 3      d  19.0      NaN
# 4      e   NaN    600.0

DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)

# Sample frame used by all set_index() examples below.
df = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale': [55, 40, 84, 31]})
df
#    month  year  sale
# 0      1  2012    55
# 1      4  2014    40
# 2      7  2013    84
# 3     10  2014    31

# Use the 'month' column as the index. With the default drop=True the
# column is removed from the data. The result is a new frame; df is unchanged.
df.set_index('month')
#        year  sale
# month
# 1      2012    55
# 4      2014    40
# 7      2013    84
# 10     2014    31

# drop=False keeps 'month' as a regular column as well as the index.
df.set_index('month', drop=False)
#        month  year  sale
# month
# 1          1  2012    55
# 4          4  2014    40
# 7          7  2013    84
# 10        10  2014    31

# append=True adds 'month' as an extra index level instead of replacing the
# existing index. The returned frame is not stored here, so df still has its
# original 0..3 index and `df.loc[0]` below addresses the first original row.
df.set_index('month', append=True)
df.loc[0]
# month       1
# year     2012
# sale       55
# Name: 0, dtype: int64

# inplace=True modifies df itself (and returns None).
df.set_index('month', inplace=True)
df
#        year  sale
# month
# 1      2012    55
# 4      2014    40
# 7      2013    84
# 10     2014    31

# A Series (or other array-like of matching length) can be used as the new
# index directly; its values become the index labels.
df.set_index(pd.Series(range(4)))
#    year  sale
# 0  2012    55
# 1  2014    40
# 2  2013    84
# 3  2014    31

DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)

# Sample frame: rows 0 and 1 are exact duplicates; rows 3 and 4 share
# brand and style but differ in rating.
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
})
df
#      brand style  rating
# 0  Yum Yum   cup     4.0
# 1  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5
# 3  Indomie  pack    15.0
# 4  Indomie  pack     5.0

# Default: compare all columns and keep the first occurrence of each row.
df.drop_duplicates()
#      brand style  rating
# 0  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5
# 3  Indomie  pack    15.0
# 4  Indomie  pack     5.0

# Only the 'brand' column decides what counts as a duplicate.
df.drop_duplicates(subset=['brand'])
#      brand style  rating
# 0  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5

# Duplicates are judged on (brand, style); keep='last' retains the last
# occurrence of each combination instead of the first.
df.drop_duplicates(subset=['brand', 'style'], keep='last')
#      brand style  rating
# 1  Yum Yum   cup     4.0
# 2  Indomie   cup     3.5
# 4  Indomie  pack     5.0

# drop_duplicates() returned new frames above, so df still has all 5 rows.
df.index
# RangeIndex(start=0, stop=5, step=1)

# tolist() converts the index into a plain Python list.
df.index.tolist()
# [0, 1, 2, 3, 4]

import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt

# Load the video dataset; the CSV file is decoded as GBK.
data = pd.read_csv(
    '爱奇艺视频数据.csv',
    encoding="gbk",
)
# Print the per-column summary (non-null counts and dtypes).
data.info()
#<class 'pandas.core.frame.DataFrame'>
#RangeIndex: 99999 entries, 0 to 99998
#Data columns (total 24 columns):
# #   Column  Non-Null Count  Dtype  
#---  ------  --------------  -----  
# 0   数据获取日期  99999 non-null  object 
# 1   演员      97981 non-null  object 
# 2   视频ID    99999 non-null  object 
# 3   详细链接    99998 non-null  object 
# 4   剧名      99999 non-null  object 
# 5   状态      99158 non-null  object 
# 6   类型      99999 non-null  object 
# 7   来源平台    99999 non-null  object 
# 8   整理后剧名   99999 non-null  object 
# 9   更新时间    644 non-null    object 
# 10  上映时间    78755 non-null  float64
# 11  语言      85926 non-null  object 
# 12  评分      99970 non-null  float64
# 13  地区      98728 non-null  object 
# 14  上映年份    78755 non-null  float64
# 15  简介      99970 non-null  object 
# 16  导演      97614 non-null  object 
# 17  差评数     99970 non-null  float64
# 18  评分人数    99970 non-null  float64
# 19  播放量     99453 non-null  float64
# 20  更新至     1272 non-null   float64
# 21  总集数     98871 non-null  float64
# 22  第几季     99999 non-null  int64  
# 23  好评数     99970 non-null  float64
#dtypes: float64(9), int64(1), object(14)
#memory usage: 18.3+ MB

# Two read_csv() options that control how column types are determined.
# NOTE(review): `sio` is not defined in this snippet -- presumably a path or
# file-like object (e.g. io.StringIO) holding CSV text; confirm before running.

# Option 1: state the dtype of specific columns explicitly.
pd.read_csv(sio, dtype={"user_id": int, "username": object})

# Option 2: turn off low-memory (chunked) parsing.
pd.read_csv(sio, low_memory=False)

# Preview the first three rows of the loaded dataset.
data.head(3)

# Column labels of the dataset.
data.columns
# Index(['数据获取日期', '演员', '视频ID', '详细链接', '剧名', '状态', '类型', '来源平台', '整理后剧名',
#        '更新时间', '上映时间', '语言', '评分', '地区', '上映年份', '简介', '导演', '差评数', '评分人数',
#        '播放量', '更新至', '总集数', '第几季', '好评数'],
#       dtype='object')

# Per-director totals of positive reviews ('好评数') and number of raters
# ('评分人数').
data.groupby('导演')[['好评数', '评分人数']].sum()
#                                            好评数       评分人数
# 导演
# Exact                                 375172.0   458543.0
# John Fawcett Steve Dimarco Paul Fox  1477942.0  1729878.0
# Michael Cuesta                        527348.0   604104.0
# Michael Dinner                       1032245.0  1312847.0
# Michael Engler                         47804.0    61844.0
# ...                                        ...        ...
# 龚朝                                     4634.0     8620.0
# 龚朝/杨巧文/王伟仁                         676160.0   964912.0
# 龚朝晖                                 4044245.0  5941895.0
# 龚艺群                                  194079.0   290358.0
# 龚若飞                                   29126.0    43151.0
# 1196 rows × 2 columns

# Select the two numeric columns BEFORE aggregating. Calling .sum() on the
# whole grouped frame would also try to sum every text column, which is
# wasteful and, on pandas >= 2.0 (numeric_only defaults to False), can raise
# or produce concatenated strings.
df_q1 = data.groupby('导演')[['好评数', '评分人数']].sum()
# Positive-review rate = positive reviews / number of raters.
df_q1['好评率'] = df_q1['好评数'] / df_q1['评分人数']
df_q1
#                                            好评数       评分人数       好评率
# 导演
# Exact                                 375172.0   458543.0  0.818183
# John Fawcett Steve Dimarco Paul Fox  1477942.0  1729878.0  0.854362
# Michael Cuesta                        527348.0   604104.0  0.872942
# Michael Dinner                       1032245.0  1312847.0  0.786265
# Michael Engler                         47804.0    61844.0  0.772977
# ...                                        ...        ...       ...
# 龚朝                                     4634.0     8620.0  0.537587
# 龚朝/杨巧文/王伟仁                         676160.0   964912.0  0.700748
# 龚朝晖                                 4044245.0  5941895.0  0.680632
# 龚艺群                                  194079.0   290358.0  0.668413
# 龚若飞                                   29126.0    43151.0  0.674979
# 1196 rows × 3 columns

# The 20 directors with the highest positive-review rate.
result_q1 = df_q1.sort_values('好评率', ascending=False)[:20]
result_q1

# Global matplotlib settings, applied in one call:
#   - SimHei as the sans-serif font so Chinese labels can be rendered
#   - unicode_minus off so the minus sign still displays with that font
#   - figure resolution of 100 dpi
#   - default figure size of 10 x 3
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
    'figure.dpi': 100,
    'figure.figsize': (10, 3),
})

# Version 1: bar chart drawn with matplotlib directly.
top_rates = result_q1['好评率']
plt.bar(top_rates.index, top_rates)
# Restrict the y axis to the 0.98 - 1 range.
plt.ylim(0.98, 1)
# Slant the x tick labels.
plt.xticks(rotation=70)
# Dashed grid lines.
plt.grid(True, linestyle='--')

# Version 2: the same data drawn through the pandas plotting accessor.
bar_options = dict(
    color='b',
    width=0.8,
    alpha=0.4,
    rot=45,
    grid=True,
    ylim=[0.98, 1],
    figsize=(12, 4),
    title='不同导演电影的好评率',
)
result_q1['好评率'].plot.bar(**bar_options)

# Total number of raters ('评分人数') per release year ('上映年份').
movie_year = data.groupby('上映年份')[['评分人数']].sum()
# Keep only the rows from index label 2000 onwards.
movie_year_2000 = movie_year.loc[2000:]
# Stacked-area plot: release year on x, total raters on y.
years = movie_year_2000.index
rater_totals = movie_year_2000['评分人数']
plt.stackplot(years, rater_totals)

# Per-title totals of raters ('评分人数') and positive reviews ('好评数'),
# grouped by the cleaned-up title column ('整理后剧名').
movie_title_group = data.groupby('整理后剧名')[['评分人数', '好评数']].sum()
# The 20 titles with the most positive reviews.
ranked_titles = movie_title_group.sort_values('好评数', ascending=False)
result_title = ranked_titles.head(20)
result_title

# Bar chart of positive-review counts for those titles.
title_bar_options = dict(
    color='b',
    width=0.8,
    alpha=0.4,
    rot=45,
    grid=True,
    ylim=[1.3e+08, 1.1e+09],
    figsize=(12, 4),
    title='不同剧的好评数',
)
result_title['好评数'].plot.bar(**title_bar_options)