【B 站视频教程】抓取用户微博和批量抓取评论
点击上方 月小水长 并 设为星标,第一时间接收干货推送
如何抓取用户的所有微博,该部分代码地址在: 一个爬取用户所有微博的爬虫,还能断网续爬那种(点击直达),下面的视频详情演示了这个过程
{
"cookie": "换成你的 cookie",
"comments": [
{
"mid": "KCXTUah9W",
"uid": "2656274875",
"limit": 100000,
"decs": "吴京说神州十三号太美了"
},
{
"mid": "KCYA7jubh",
"uid": "2803301701",
"limit": 100000,
"decs": "吴京说神州十三号太美了"
}
]
}
# -*- coding: utf-8 -*-
# author: inspurer(月小水长)
# create_time: 2021/10/17 10:31
# 运行环境 Python3.6+
# github https://github.com/inspurer
# 微信公众号 月小水长
import json
import pandas as pd
# Per-post cap on how many comments the crawler should fetch.
limit = 10000
# Comment-crawler config file that this script regenerates below.
config_path = 'mac_comment_config.json'
# CSV produced by the topic crawler; the loop below reads its
# 'weibo_link' and 'user_name' columns.
data_path = './topic/小米.csv'
def drop_duplicate(path, col_index=0):
    """De-duplicate a CSV file in place.

    Rows are considered duplicates when they share the same value in the
    column at position *col_index*; the first occurrence is kept.  Rows that
    merely repeat the header line (a common leftover when CSV chunks are
    appended together) are dropped as well.  The file is rewritten with a
    UTF-8 BOM ('utf-8-sig') so spreadsheet tools open it correctly.

    :param path: path of the CSV file to clean in place
    :param col_index: positional index of the column used as the dedup key
    """
    df = pd.read_csv(path)
    key_column = df.columns.tolist()[col_index]
    # Keep only the first row for each key value.
    df.drop_duplicates(keep='first', inplace=True, subset=[key_column])
    # Remove stray header rows (cell text equal to the column name).
    # NOTE: ``~`` is the supported element-wise negation of a boolean
    # Series; the original unary ``-`` is deprecated and raises TypeError
    # on modern pandas/NumPy.
    df = df[~df[key_column].isin([key_column])]
    df.to_csv(path, encoding='utf-8-sig', index=False)
# Clean the topic CSV before turning it into a comment-crawler config.
drop_duplicate(data_path)

# Load the existing crawler config ('utf-8-sig' tolerates a leading BOM).
with open(config_path, 'r', encoding='utf-8-sig') as f:
    config_json = json.loads(f.read())

df = pd.read_csv(data_path)

# Clear the previous comments config; comment this out to keep old entries.
config_json['comments'].clear()

total = df.shape[0]
for row_no, record in df.iterrows():
    print(f'{row_no + 1}/{total}')
    weibo_link = record['weibo_link']
    # Strip any query string so the link ends with ".../<uid>/<mid>".
    if '?' in weibo_link:
        weibo_link = weibo_link.split('?', 1)[0]
    # The link looks like https://weibo.com/<uid>/<mid>; slice out both ids.
    last_slash = weibo_link.rindex('/')
    uid = weibo_link[weibo_link.index('com') + 4:last_slash]
    mid = weibo_link[last_slash + 1:]
    config_json['comments'].append({
        'mid': mid,
        'uid': uid,
        'limit': limit,
        'desc': record['user_name']
    })

# Write the updated config back, keeping non-ASCII text human-readable.
with open(config_path, 'w', encoding='utf-8-sig') as f:
    f.write(json.dumps(config_json, indent=2, ensure_ascii=False))
评论