Hi all,

I am using Turftopic on a dataset of news articles, and I need the topic probability distribution over all of the clustered topics for each article. If I just use the topic_distribution method with its defaults, I only get scores for 10 topics, whereas I need scores for every topic. So I tweaked the default top_k parameter, but the estimated time to process this is enormous - around 12 hours. Is there a faster way to do this?

Here's my code:

import pandas as pd
from tqdm import tqdm
from turftopic import ClusteringTopicModel
from umap import UMAP
from hdbscan import HDBSCAN


def turf_topic_clustering(df, text_column):
    # Pull the raw documents out of the dataframe
    texts = df[text_column].tolist()
    model = ClusteringTopicModel(
        dimensionality_reduction=UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
        ),
        clustering=HDBSCAN(
            min_cluster_size=15,
            min_samples=7,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        ),
        feature_importance="c-tf-idf",
        reduction_method="average",
        reduction_distance_metric="cosine",
        reduction_topic_representation="component",
    )
    model.fit(texts)
    # Extract the dominant topic and the full score vector for every article.
    # This per-document loop is the part that is estimated to take ~12 hours.
    dominant_topics = []
    topic_distributions = []
    for text in tqdm(texts):
        dist_df = model.topic_distribution_df(text, top_k=len(model.topic_names))
        topic_distributions.append(dist_df["Score"].values)
        # dist_df appears to be sorted by score, so the first row is the
        # dominant topic
        dominant_topic = dist_df.iloc[0]["Topic name"]
        dominant_topics.append(dominant_topic)
    # Add the dominant topic and the per-topic scores to the dataframe
    df_with_topics = df.copy()
    df_with_topics["dominant_topic"] = dominant_topics
    topic_prob_df = pd.DataFrame(
        topic_distributions,
        columns=[f"topic_{i}" for i in range(len(model.topic_names))],
    )
    df_with_topics = pd.concat(
        [df_with_topics.reset_index(drop=True), topic_prob_df.reset_index(drop=True)],
        axis=1,
    )
    return model, df_with_topics


model, df_with_topics = turf_topic_clustering(news_opinion, text_column="article_text")
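
One idea I had was to skip the per-document loop entirely. As far as I can tell, Turftopic models expose a fit_transform method that returns a document-topic matrix in a single pass, but I haven't verified that its columns line up with model.topic_names for a ClusteringTopicModel, so this is just a sketch of what I mean:

# Sketch (unverified): fit_transform should return one row of topic scores
# per document in a single pass, instead of calling topic_distribution_df
# once per article.
doc_topic_matrix = model.fit_transform(texts)  # expected shape: (n_docs, n_topics)
topic_prob_df = pd.DataFrame(
    doc_topic_matrix,
    columns=[f"topic_{i}" for i in range(doc_topic_matrix.shape[1])],
)
# Dominant topic = highest-scoring column per row (assumes a consistent
# column order across documents)
dominant_topics = topic_prob_df.idxmax(axis=1)

Would that give the same scores as topic_distribution_df, or is it computed differently for clustering models?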