Python分析6625条视频,告诉你“打工人”凭什么能刷爆全网
大数据DT
共 3939字,需浏览 8分钟
·
2020-11-10 22:16
导读:今天教大家用Python分析B站的“打工人”视频。
本文分为三个部分:网络数据获取、数据读入和数据清洗、数据可视化分析。
# 导入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Read the Excel export of the video records into a DataFrame
df = pd.read_excel('./data/B站打工人视频10-28.xlsx')
# Preview the first rows (a bare expression: only shown when run interactively)
df.head()
# Report (rows, columns) of the loaded table
print(df.shape)
(6625, 7)
数据清洗步骤:去除重复值;view_num和danmu:单位转换;筛选数据。
def transform_unit(x_col, data=None):
    """
    Convert a count column written with Chinese unit suffixes into numbers.

    Each value is split into its leading number and an optional run of
    Chinese characters naming the unit, then the two are multiplied,
    e.g. '1.5万' -> 15000.0 and '856' -> 856.0.

    Parameters
    ----------
    x_col : str
        Name of the column to convert.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the module-level ``df``.

    Returns
    -------
    pandas.DataFrame
        A single float column (labelled 0) on the same index as the input,
        so it can be assigned straight back with ``df[x_col] = ...``.
        Entries without digits, or with a unit other than 万/亿, are NaN.
    """
    frame = df if data is None else data

    # Cast to str first: the .str accessor does not handle non-string cells
    # (e.g. counts that were read from Excel as real numbers).
    text = frame[x_col].astype('str')

    # Leading numeric part, e.g. '1.5' out of '1.5万'
    number = text.str.extract(r'(\d+\.*\d*)', expand=False).astype('float')

    # Unit suffix: the first run of CJK characters, NaN when there is none
    unit = text.str.extract(r'([\u4e00-\u9fa5]+)', expand=False)

    multipliers = {'万': 1e4, '亿': 1e8}
    # No unit means "times one"; an unrecognised unit stays NaN on purpose
    # rather than being silently treated as 1.
    factor = unit.map(multipliers).where(unit.notna(), 1.0).astype('float')

    return (number * factor).to_frame(0)
# Remove duplicate rows (done while the URL column is still present)
df = df.drop_duplicates()

# Remove the video_url column
df = df.drop(columns='video_url')

# Convert the unit-suffixed count columns to numbers
for count_col in ('view_num', 'danmu'):
    df[count_col] = transform_unit(x_col=count_col)

# Keep rows dated 2020-09-01 or later whose title contains '打工人'
uploaded_in_range = df['upload_time'] >= '2020-09-01'
title_matches = df['title'].astype('str').str.contains('打工人')
df = df[uploaded_in_range & title_matches]
df.head()
import jieba
from pyecharts.charts import Bar, Line, Pie, Map, Scatter, Page
from pyecharts import options as opts
from pyecharts.globals import SymbolType, WarningType
WarningType.ShowWarning = False
# Number of rows per upload_time value, ordered by upload_time
time_num = df['upload_time'].value_counts().sort_index()
time_num.head(5)
2020-09-05 1
2020-09-08 1
2020-09-09 1
2020-09-12 1
2020-09-13 1
Name: upload_time, dtype: int64
# Line chart of the per-date counts in time_num, with min and max marked
extreme_marks = opts.MarkPointOpts(
    data=[opts.MarkPointItem(type_='min'), opts.MarkPointItem(type_='max')]
)
line1 = (
    Line(init_opts=opts.InitOpts(width='1350px', height='750px'))
    .add_xaxis(time_num.index.tolist())
    .add_yaxis('', time_num.values.tolist(), markpoint_opts=extreme_marks)
    .set_global_opts(
        title_opts=opts.TitleOpts(title='打工人视频发布热度走势图', pos_left='40%'),
        # Rotate the x-axis labels by 90 degrees
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate='90')),
        # Visual map scaled to the largest count, with its control hidden
        visualmap_opts=opts.VisualMapOpts(max_=int(time_num.max()), is_show=False),
    )
    .set_series_opts(
        linestyle_opts=opts.LineStyleOpts(width=3),
        label_opts=opts.LabelOpts(is_show=False),
    )
)
line1.render()
评论