对数据集“Netflix电影电视剧及用户观影数据”的分析处理和可视化

对数据集“Netflix电影电视剧及用户观影数据”的分析处理和可视化

一、寻找数据集

from kaggle:《Netflix Movies and TV Shows》 -------- Shivam Bansal

二、数据集分析

1、首先,通过pandas模块导入csv文件

import pandas as pd

# Load the movie table into a DataFrame; every later cell reads its columns
# through the module-level name `data`.
# NOTE(review): a later cell reloads the table from 'movie_metadata.csv' --
# confirm which of the two filenames is the real one.
data = pd.read_csv('movie_data.csv')

In [3] data #数据内容

num_critic_for_reviews duration gross genres num_voted_users num_user_for_reviews language country budget title_year imdb_score

0 723.0 178.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi 886204 3054.0 English USA 237000000.0 2009.0 7.9

1 302.0 169.0 309404152.0 Action|Adventure|Fantasy 471220 1238.0 English USA 300000000.0 2007.0 7.1

2 602.0 148.0 200074175.0 Action|Adventure|Thriller 275868 994.0 English UK 245000000.0 2015.0 6.8

3 813.0 164.0 448130642.0 Action|Thriller 1144337 2701.0 English USA 250000000.0 2012.0 8.5

4 NaN NaN NaN Documentary 8 NaN NaN NaN NaN NaN 7.1

... ... ... ... ... ... ... ... ... ... ... ...

5038 1.0 87.0 NaN Comedy|Drama 629 6.0 English Canada NaN 2013.0 7.7

5039 43.0 43.0 NaN Crime|Drama|Mystery|Thriller 73839 359.0 English USA NaN NaN 7.5

5040 13.0 76.0 NaN Drama|Horror|Thriller 38 3.0 English USA 1400.0 2013.0 6.3

5041 14.0 100.0 10443.0 Comedy|Drama|Romance 1255 9.0 English USA NaN 2012.0 6.3

5042 43.0 90.0 85222.0 Documentary 4285 84.0 English USA 1100.0 2004.0 6.6

5043 rows × 11 columns

2、然后我们处理数据集中IMDB电影评分的数据:

# Count movies per IMDB-score bucket: [0, 5), [5, 7), [7, 9) and [9, 10].
#
# The ranges are half-open so that the boundary scores 5.0 and 7.0 land in the
# bucket that starts with them instead of falling through every branch into
# the last bucket. Iterating the column itself (rather than range(5043))
# keeps the loop correct for any number of rows.
score1, score2, score3, score4 = 0, 0, 0, 0

for score in data.imdb_score.dropna():  # rows without a score are not counted
    if score < 5:
        score1 += 1
    elif score < 7:
        score2 += 1
    elif score < 9:
        score3 += 1
    else:
        score4 += 1

导入绘图包,这里我使用的是matplotlib

import matplotlib.pyplot as plt

# Pie chart of the four IMDB-score buckets (score1..score4).
labels = ('0-5', '5-7', '7-9', '>9')  # wedge names
sizes = (score1, score2, score3, score4)  # wedge values
colors = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral')  # wedge colours
explode = (0, 0.1, 0, 0)  # pull the second wedge slightly out of the pie

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',  # print each share with one decimal place
    shadow=True,
    startangle=50,
)
plt.axis('equal')  # equal axis scaling keeps the pie circular
plt.show()

3、再统计1987-2019年的中国电影,并做成折线图

from pandas import Series,DataFrame

import numpy as np

# Count Chinese movies per release year (1987-2019) and draw them as a line.
first_year, last_year = 1987, 2019
x = np.arange(first_year, last_year + 1)  # x axis: one point per year

# Release years of the rows whose country is China. Rows without a year are
# dropped first because int(NaN) raises. Counting by year value (instead of
# indexing a list with `year - 1987`) means a year before 1987 can no longer
# wrap around to a negative index and a year after 2019 can no longer raise
# IndexError -- such years are simply not plotted.
china_years = data.loc[data.country == 'China', 'title_year'].dropna().astype(int)
per_year = china_years.value_counts()
y = [int(per_year.get(year, 0)) for year in x.tolist()]  # y axis: movies per year

plt.figure(figsize=(10, 4), dpi=100)  # create the canvas
plt.plot(x, y)
plt.show()  # display the figure

4、接下来统计各国电影的数量

# `mpl` was used without being imported anywhere, which raised NameError.
import matplotlib as mpl

mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False  # let CJK text and minus signs render

plt.figure(figsize=(8, 6))

# Number of movies per country.
# value_counts() does the grouping in one pass and skips rows with a missing
# country, so NaN no longer shows up as a label with a count of 0.
# `labels` holds the country names and `fracs` the matching counts, ordered
# from the most to the least frequent country; tolist() yields plain Python
# str/int values.
country_counts = data.country.value_counts()
labels = country_counts.index.tolist()
fracs = country_counts.tolist()

导入画世界地图所需的包

from pyecharts import options as opts

from pyecharts.charts import Map,Geo

import os

作图

# Build the [country, count] pairs for the map series.
# The pairs are stored under their own name: the original reused `data`,
# which replaced the movie DataFrame with a list and broke every later
# `data.<column>` access.
map_data = [[country, count] for country, count in zip(labels, fracs)]

# NOTE(review): the country names come straight from the dataset (e.g. 'USA',
# 'UK'); confirm they match the region names the "world" map expects.
c = (
    Map()
    .add("Netflix全球电影分布", map_data, "world")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        # The USA count (3807) dwarfs every other country, so the colour
        # scale is capped at 200 to keep the remaining countries readable.
        visualmap_opts=opts.VisualMapOpts(max_=200),
    )
)

c.render_notebook()  # display the map inside Jupyter

# os.system("render.html") # 用html打开

5、我们再来统计各类电影的占比

PS:这里我采用依次计数的方式,有更好的方法请告诉我。。。。

# Count how many movies carry each genre tag.
#
# `data.genres` holds '|'-separated tags such as 'Action|Adventure|Sci-Fi',
# so one movie can count towards several genres. A literal substring match
# per genre replaces sixteen hand-written `if` branches; na=False makes a
# missing genre value count as "no match" instead of raising TypeError.
# NOTE: `data` must be the movie DataFrame when this cell runs.
genre_names = (
    "Action", "Adventure", "Fantasy", "Sci-Fi", "Mystery", "Family",
    "Thriller", "Documentary", "Romance", "Comedy", "Animation", "Musical",
    "Western", "History", "Drama", "Crime",
)

# The targets below are listed in the same order as `genre_names`.
(action, adventure, fantasy, sciencefiction, mystery, family, thriller,
 documentary, romance, comedy, animation, musical, western, history, drama,
 crime) = (
    int(data.genres.str.contains(name, regex=False, na=False).sum())
    for name in genre_names
)

print(action, adventure, fantasy, sciencefiction, mystery, family, thriller,
      documentary, romance, comedy, animation, musical, western, history,
      drama, crime)

out:1153 923 610 616 500 546 1411 121 1107 1872 242 132 97 207 2594 889

# Pie chart of the per-genre movie counts, built the same way as the score pie.
print(" 电影类型饼状图")

labels = (
    'action', 'adventure', 'fantasy', 'sciencefiction',
    'mystery', 'family', 'thriller', 'documentary',
    'romance', 'comedy', 'animation', 'musical',
    'western', 'history', 'drama', 'crime',
)
sizes = (
    action, adventure, fantasy, sciencefiction,
    mystery, family, thriller, documentary,
    romance, comedy, animation, musical,
    western, history, drama, crime,
)

# Four colours repeated four times cover the sixteen wedges.
palette = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral')
colors = palette * 4
explode = (0,) * 16  # no wedge is pulled out

plt.pie(
    sizes,
    radius=2.5,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=50,
)
plt.show()

5、我们再统计Netflix上电影的预算以及对应的总票房和观众的认可度

# Plot budget against gross for 25 movies that have a budget value.
data = data.loc[data.budget.notnull()]  # drop rows whose budget is missing

# Take rows 100..124 BY POSITION. After the filter above the index has gaps,
# so the label lookup `data.budget[i]` raised KeyError whenever label i was
# one of the dropped rows.
sample = data.iloc[100:125]
y1 = sample.budget.tolist()
y2 = sample.gross.tolist()

x = np.arange(len(y1))  # x axis: position of the movie within the sample

plt.figure(figsize=(8, 4))  # create the canvas
plt.plot(x, y1, '.-', label='预算/投入')  # first line: budget
plt.plot(x, y2, '.-', label='票房/收入')  # second line: gross
plt.legend()
plt.xlabel('个数')
plt.ylabel('/十亿美元')
plt.ylim((0, 1000000000))  # clip the y axis at 1e9
plt.title('Netflix电影的预算以及对应的总票房')
plt.show()

6、最后我们分析数据集中,观众点赞数的数据

①总数据集观众点赞数饼状图分布

# Reload the full table and count movies per vote-count bucket.
data = pd.read_csv('movie_metadata.csv')

# Buckets are half-open: [0, 2000), [2000, 10000), [10000, 20000),
# [20000, 50000) and [50000, inf). The original used strict comparisons on
# both sides, so a movie with exactly 2000, 10000, 20000 or 50000 votes was
# not counted in any bucket. Comparing the whole column at once also removes
# the hard-coded row count. Rows with a missing vote count match no bucket.
votes = data.num_voted_users
score1 = int((votes < 2000).sum())
score2 = int(((votes >= 2000) & (votes < 10000)).sum())
score3 = int(((votes >= 10000) & (votes < 20000)).sum())
score4 = int(((votes >= 20000) & (votes < 50000)).sum())
score5 = int((votes >= 50000).sum())

# Pie chart of the five vote-count buckets (score1..score5).
labels1 = ('2千以下', '2千-1万', '1万-2万', '2万-5万', '5万以上')
sizes = (score1, score2, score3, score4, score5)
colors = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'gold')
explode = (0,) * 5  # no wedge is pulled out

plt.pie(
    sizes,
    explode=explode,
    labels=labels1,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=50,
)
plt.axis('equal')  # equal axis scaling keeps the pie circular
plt.title('观众点赞饼状图')
plt.show()

②从数据集中随机抽取100个数据作成散点图

import random

# Scatter plot of the review counts of 100 randomly chosen movies.
sample_size = 100

# Pick random row positions across the whole table. The original stored them
# in a variable named `list`, which shadowed the builtin and made any later
# `list(...)` call fail, and drew labels from the hard-coded range 1..4551.
# Rows may repeat, as before.
row_positions = [random.randrange(len(data)) for _ in range(sample_size)]
d2 = data.num_user_for_reviews.iloc[row_positions].tolist()

# The x coordinates are random normal values: they only spread the points
# horizontally and carry no information about the movies.
d1 = np.random.randn(sample_size)

plt.scatter(d1, d2)
plt.title("观众点赞/投票数散点图")
plt.show()  # display the figure, as the other plotting cells do

数据集分析完毕

Copyright © 2088 世界杯预选赛中国_1994年世界杯冠军是谁 - nywk120.com All Rights Reserved.
友情链接
Top