# LDA topic modeling on yesterday's CCTV news (JoinQuant research environment).
#
# Pipeline: fetch 5 news items -> load Chinese stopwords -> segment with jieba
# -> CountVectorizer bag-of-words -> LatentDirichletAllocation -> print the
# top words of each topic.

# 1. Imports (jqdata / jieba / sklearn are provided by the JoinQuant platform)
from jqdata import *
import datetime
import urllib.request

import pandas as pd
import jieba
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# 2. Fetch yesterday's CCTV news, first 5 rows only
yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
q = query(finance.CCTV_NEWS.day, finance.CCTV_NEWS.content).filter(
    finance.CCTV_NEWS.day == yesterday
).limit(5)
news_jq = finance.run_query(q)
news = pd.DataFrame({'content': news_jq['content'].tolist()})
print(news.head())

# 3. Load the stopword list straight from the remote file.
#    NOTE: the original code called urlretrieve() to write into a
#    NamedTemporaryFile's path and then iterated the still-open 'w+' handle.
#    urlretrieve downloads through a *second* handle, so reading the original
#    handle yields nothing (its buffer never saw the data), and on Windows the
#    file cannot be opened twice at all. Streaming the HTTP response avoids
#    the temp file entirely.
STOPWORDS_URL = ('https://raw.githubusercontent.com/goto456/stopwords/'
                 'master/cn_stopwords.txt')
with urllib.request.urlopen(STOPWORDS_URL) as resp:
    stopwords = {line.strip() for line in resp.read().decode('utf-8').splitlines()}

# 4. Segment each article; drop stopwords and single-character tokens
tokens = []
for text in news['content']:
    words = [w for w in jieba.cut(text) if w not in stopwords and len(w) > 1]
    tokens.append(' '.join(words))

# 5. Attach the space-joined tokens (CountVectorizer expects whitespace-
#    separated strings, not token lists)
news['tokens'] = tokens

# 6. Fit a 3-topic LDA on the term-count matrix (fixed seed for repeatability)
vec = CountVectorizer()
X = vec.fit_transform(news['tokens'])
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

# 7. Show the highest-weight words of each topic
def print_top_words(model, feature_names, n_top=8):
    """Print the n_top highest-weight words for every topic in *model*."""
    for idx, topic in enumerate(model.components_):
        # argsort ascending, take the last n_top indices, reverse to descending
        top = [feature_names[i] for i in topic.argsort()[-n_top:][::-1]]
        print(f'主题 {idx+1}: {" / ".join(top)}')

# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
print_top_words(lda, vec.get_feature_names_out())
Copyright © 2024-2025 成都宁时科技有限公司 版权所有