手把手教你爬取50W基金贴吧数据,并做投资者情绪分析!

网页分析
第1页:http://guba.eastmoney.com/list,of161725.html

第2页:http://guba.eastmoney.com/list,of161725_2.html

翻页规律:在基金代码后追加“_页码”即可得到对应页的地址。
数据爬取
import csv
import random
import time
import traceback
from time import sleep

import requests
from fake_useragent import UserAgent
from lxml import etree

# Fetch one list page of the fund's discussion board.
page = 1  # page number to fetch
fundcode = 161725  # can be replaced with any fund code

# Pause for a random 1-2 seconds (fractional values included) before the request.
sleep(random.uniform(1, 2))
headers = {"User-Agent": UserAgent(verify_ssl=False).random}
url = f'http://guba.eastmoney.com/list,of{fundcode}_{page}.html'
response = requests.get(url, headers=headers, timeout=10)
# Fixed: the original printed the misspelled name `reponse`, which raised NameError.
print(response)

# Parse the fetched HTML and select the <div> rows under #articlelistnew.
parse = etree.HTML(response.text)
# The slice skips the first <div> and keeps at most 90 rows
# (presumably the first one is a header row - TODO confirm against the page).
items = parse.xpath('//*[@id="articlelistnew"]/div')[1:91]

# Relative XPath of every field inside one row, keyed by its output column.
field_xpaths = {
    '阅读': './span[1]/text()',
    '评论': './span[2]/text()',
    '标题': './span[3]/a/text()',
    '作者': './span[4]/a/font/text()',
    '时间': './span[5]/text()',
}

for row in items:
    # Join the matched text nodes of each field and trim surrounding whitespace.
    item = {
        field: ''.join(row.xpath(path)).strip()
        for field, path in field_xpaths.items()
    }
    print(item)
# Append the parsed record as one row to the fund's CSV file (no header row is written).
fieldnames = ['阅读', '评论', '标题', '作者', '时间']
with open(f'./{fundcode}.csv', 'a', encoding='utf_8_sig', newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames=fieldnames)
    writer.writerow(item)
# Main function
def main(page):
    """Fetch one list page of the fund board and pass it on for parsing.

    The work is delegated to ``get_fund`` and ``parse_fund``, which are
    defined outside this section.

    Args:
        page: Page number substituted into the list URL.
    """
    fundcode = 161725  # can be replaced with any fund code
    url = f'http://guba.eastmoney.com/list,of{fundcode}_{page}.html'
    parse_fund(get_fund(url), fundcode)


if __name__ == '__main__':
    last_page = 6371  # crawl multiple pages (6371 pages in total)
    for page in range(1, last_page + 1):
        main(page)
        # Wait a random 1-2 seconds between pages.
        time.sleep(random.uniform(1, 2))
        print(f"第{page}页提取完成")
投资者情绪
import numpy as np
import pandas as pd

# Load the scraped posts; the column labels are supplied explicitly via names=.
column_names = ['阅读', '评论', '标题', '作者', '时间']
df = pd.read_csv("/菜J学Python/金融/天天基金/161725.csv", names=column_names)
# Remove duplicated rows and rows with missing values.
df = df.drop_duplicates()
df = df.dropna()

# Type conversion: read counts to float.
# Values carrying the "万" (ten-thousand) suffix are scaled by 10,000.
# Previously the suffix was only stripped, so "1.2万" became 1.2 instead of 12000.
reads = df['阅读'].astype(str)
has_wan = reads.str.contains('万', regex=False)
read_values = reads.str.replace('万', '', regex=False).astype('float')
df['阅读'] = read_values.where(~has_wan, read_values * 10000)

# Type conversion: timestamps. If the column cannot be parsed it is left
# unchanged, which is what the deprecated errors='ignore' option used to do.
try:
    df['时间'] = pd.to_datetime(df['时间'])
except (ValueError, TypeError, OverflowError):
    pass


# Mechanical compression (removal of back-to-back repeats).
def yasuo(st):
    """Collapse immediately repeated substrings in ``st``.

    For each chunk length ``i`` from 1 up to half the length of the input,
    the string is scanned left to right; wherever a chunk of ``i`` characters
    is directly followed by an identical chunk, the run of repeats is cut
    down, e.g. ``"J哥J哥J哥J哥J哥"`` becomes ``"J哥"``.

    Args:
        st: The text to compress.

    Returns:
        The compressed text.
    """
    for i in range(1, int(len(st) / 2) + 1):
        # range(len(st)) is fixed when the scan starts; positions past the
        # (possibly shortened) string compare '' == '' and leave st unchanged.
        for j in range(len(st)):
            if st[j:j + i] == st[j + i:j + 2 * i]:
                # Advance k over every further adjacent copy of the chunk.
                k = j + i
                while st[k:k + i] == st[k + i:k + 2 * i] and k < len(st):
                    k = k + i
                st = st[:j] + st[k:]
    return st


yasuo(st="J哥J哥J哥J哥J哥")  # demo call; evaluates to "J哥"
df["标题"] = df["标题"].apply(yasuo)

# Filter out emoji: keep only the first run of Chinese characters of each title.
df['标题'] = df['标题'].str.extract(r"([\u4e00-\u9fa5]+)")
df = df.dropna()  # titles without any Chinese character are dropped

# Filter out short titles (fewer than 3 characters).
df = df[df["标题"].apply(len) >= 3]
df = df.dropna()
import jieba
import stylecloud
from IPython.display import Image

# Draw the word cloud.
# get_cut_words is defined outside this section; its result is joined with
# spaces below, so it is expected to yield an iterable of strings.
text1 = get_cut_words(content_series=df['标题'])
cloud_text = ' '.join(text1)
stylecloud.gen_stylecloud(
    text=cloud_text,
    max_words=200,
    collocations=False,
    font_path='simhei.ttf',
    icon_name='fas fa-heart',
    size=653,
    # palette='matplotlib.Inferno_9',
    output_name='./基金.png',
)
# Display the generated image (as the last expression of a notebook cell).
Image(filename='./基金.png')

import paddlehub as hub

# Score every title with the "senta_bilstm" PaddleHub module.
senta = hub.Module(name="senta_bilstm")
texts = df['标题'].tolist()
input_data = dict(text=texts)
res = senta.sentiment_classify(data=input_data)

# Store the 'positive_probs' value of each result as the sentiment column.
positive_scores = []
for result in res:
    positive_scores.append(result['positive_probs'])
df['投资者情绪'] = positive_scores
# Resample to 15-minute bins.
df['时间'] = pd.to_datetime(df['时间'])
df.index = df['时间']
# Average only the numeric columns: on pandas >= 2.0, .mean() over a frame that
# still holds text columns raises TypeError instead of silently skipping them.
# select_dtypes keeps the datetime index, so this works on old and new pandas.
data = df.select_dtypes(include='number').resample('15min').mean().reset_index()
import akshare as ak
import matplotlib  # needed for matplotlib.use(); importing pyplot alone does not bind this name
import matplotlib.pyplot as plt

# Download 15-minute bars for symbol sh000001 and prepare the join/plot columns.
sz_index = ak.stock_zh_a_minute(symbol='sh000001', period='15', adjust="qfq")
sz_index['日期'] = pd.to_datetime(sz_index['day'])
sz_index['收盘价'] = sz_index['close'].astype('float')

# Inner join: keep only the 15-minute bins present in both data sets.
data = data.merge(sz_index, left_on='时间', right_on='日期', how='inner')

matplotlib.use('Qt5Agg')
data.index = data['时间']
# Fixed: secondary_y must name a plotted column. The original passed 'close',
# which is not among the two plotted columns, so no secondary axis was used.
data[['投资者情绪', '收盘价']].plot(secondary_y=['收盘价'])
plt.show()


评论
