turftopic/paper.bib at main · x-tabdeveloping/turftopic · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
% Semantic Signal Separation (S^3) preprint on arXiv.
% The math in the title is braced so sentence-casing styles cannot lowercase it.
@misc{s3,
  author        = {Kardos, M{\'a}rton and Kostkan, Jan and Vermillet, Arnault-Quentin and Nielbo, Kristoffer and Enevoldsen, Kenneth and Rocca, Roberta},
  title         = {{$S^3$} -- Semantic Signal Separation},
  year          = {2024},
  eprint        = {2406.09556},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.09556},
  doi           = {10.48550/arXiv.2406.09556},
}

% KeyNMF paper, Computational Humanities Research Conference 2024 (CEUR-WS vol. 3834).
% Fixed relative to the raw export: doubled colon in the title, a garbled
% "fi" ligature in the abstract, and trailing spaces inside editor names.
@inproceedings{keynmf,
  author    = {Kristensen-McLachlan, Ross Deans and Hicke, Rebecca Marie Matouschek and Kardos, M{\'a}rton and Thun{\o}, Mette},
  editor    = {Haverals, Wouter and Koolen, Marijn and Thompson, Laure},
  title     = {Context is {Key(NMF)}: Modelling Topical Information Dynamics in {Chinese} Diaspora Media},
  booktitle = {Proceedings of the Computational Humanities Research Conference 2024},
  series    = {CEUR Workshop Proceedings},
  volume    = {3834},
  pages     = {829--847},
  publisher = {CEUR-WS},
  address   = {Germany},
  year      = {2024},
  month     = dec,
  language  = {English},
  doi       = {10.48550/arXiv.2410.12791},
  keywords  = {Chinese, contextual topic models, information dynamics, keywords, novelty},
  abstract  = {Does the People{\textquoteright}s Republic of China (PRC) interfere with European elections through ethnic Chinese diaspora media? This question forms the basis of an ongoing research project exploring how PRC narratives about European elections are represented in Chinese diaspora media, and thus the objectives of PRC news media manipulation. In order to study diaspora media efficiently and at scale, it is necessary to use techniques derived from quantitative text analysis, such as topic modelling. In this paper, we present a pipeline for studying information dynamics in Chinese media. Firstly, we present KeyNMF, a new approach to static and dynamic topic modelling using transformer-based contextual embedding models. We provide benchmark evaluations to demonstrate that our approach is competitive on a number of Chinese datasets and metrics. Secondly, we integrate KeyNMF with existing methods for describing information dynamics in complex systems. We apply this pipeline to data from five news sites, focusing on the period of time leading up to the 2024 European parliamentary elections. Our methods and results demonstrate the effectiveness of KeyNMF for studying information dynamics in Chinese media and lay groundwork for further work addressing the broader research questions.},
}

% BERTopic preprint on arXiv.
% Stored as an arXiv eprint (like the other preprints in this file) instead of
% placing the arXiv id in the journal field; model name and acronym are
% brace-protected against sentence casing.
@misc{bertopic_paper,
  author        = {Grootendorst, Maarten},
  title         = {{BERTopic}: Neural topic modeling with a class-based {TF-IDF} procedure},
  year          = {2022},
  eprint        = {2203.05794},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2203.05794},
  doi           = {10.48550/arXiv.2203.05794},
}

% TopMost topic modeling toolkit, ACL 2024 system demonstrations.
@inproceedings{topmost,
  author    = {Wu, Xiaobao and Pan, Fengjun and Luu, Anh Tuan},
  editor    = {Cao, Yixin and Feng, Yang and Xiong, Deyi},
  title     = {Towards the {T}op{M}ost: A Topic Modeling System Toolkit},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  pages     = {31--41},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand},
  year      = {2024},
  month     = aug,
  doi       = {10.18653/v1/2024.acl-demos.4},
  url       = {https://aclanthology.org/2024.acl-demos.4/},
  abstract  = {Topic models have a rich history with various applications and have recently been reinvigorated by neural topic modeling. However, these numerous topic models adopt totally distinct datasets, implementations, and evaluations. This impedes quick utilization and fair comparisons, and thereby hinders their research progress and applications. To tackle this challenge, we in this paper propose a Topic Modeling System Toolkit (TopMost). Compared to existing toolkits, TopMost stands out by supporting more extensive features. It covers a broader spectrum of topic modeling scenarios with their complete lifecycles, including datasets, preprocessing, models, training, and evaluations. Thanks to its highly cohesive and decoupled modular design, TopMost enables rapid utilization, fair comparisons, and flexible extensions of diverse cutting-edge topic models. Our code, tutorials, and documentation are available at https://github.com/bobxwu/topmost.},
}

% Nielbo et al. (2024), "Quantitative text analysis", Nature Reviews Methods Primers.
% The url no longer carries the "#citeas" page anchor: it is not part of the
% article address and a raw "#" can break LaTeX when the url is typeset.
@article{quantitative_text_analysis,
  author  = {Nielbo, Kristoffer L. and Karsdorp, Folgert and Wevers, Melvin and Lassche, Alie and Baglini, Rebekah B. and Kestemont, Mike and Tahmasebi, Nina},
  title   = {Quantitative text analysis},
  journal = {Nature Reviews Methods Primers},
  volume  = {4},
  number  = {1},
  year    = {2024},
  month   = apr,
  doi     = {10.1038/s43586-024-00302-w},
  url     = {https://www.nature.com/articles/s43586-024-00302-w},
}

% STREAM topic modelling module, ACL 2024 short papers.
% Abstract: restored the missing space after "tabular information.".
@inproceedings{stream,
  author    = {Thielmann, Anton and Reuter, Arik and Weisser, Christoph and Kant, Gillian and Kumar, Manish and S{\"a}fken, Benjamin},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  title     = {{STREAM}: Simplified Topic Retrieval, Exploration, and Analysis Module},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages     = {435--444},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand},
  year      = {2024},
  month     = aug,
  doi       = {10.18653/v1/2024.acl-short.41},
  url       = {https://aclanthology.org/2024.acl-short.41/},
  abstract  = {Topic modeling is a widely used technique to analyze large document corpora. With the ever-growing emergence of scientific contributions in the field, non-technical users may often use the simplest available software module, independent of whether there are potentially better models available. We present a Simplified Topic Retrieval, Exploration, and Analysis Module (STREAM) for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. For better topic analysis, we implement multiple intruder-word based topic evaluation metrics. Additionally, we publicize multiple new datasets that can extend the so far very limited number of publicly available benchmark datasets in topic modeling. We integrate downstream interpretable analysis modules to enable users to easily analyse the created topics in downstream tasks together with additional tabular information. The code is available at the following link: https://github.com/AnFreTh/STREAM},
}

% Contextualized topic models (combined with neural topic models), ACL-IJCNLP 2021 short papers.
@inproceedings{ctm,
  author    = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk},
  editor    = {Zong, Chengqing and Xia, Fei and Li, Wenjie and Navigli, Roberto},
  title     = {Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  pages     = {759--766},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2021},
  month     = aug,
  doi       = {10.18653/v1/2021.acl-short.96},
  url       = {https://aclanthology.org/2021.acl-short.96},
  abstract  = {Topic models extract groups of words from documents, whose interpretation as a topic hopefully allows for a better understanding of the data. However, the resulting word groups are often not coherent, making them harder to interpret. Recently, neural topic models have shown improvements in overall coherence. Concurrently, contextual embeddings have advanced the state of the art of neural models in general. In this paper, we combine contextualized representations with neural topic models. We find that our approach produces more meaningful and coherent topics than traditional bag-of-words topic models and recent neural models. Our results indicate that future improvements in language models will translate into better topic models.},
}

% Zero-shot cross-lingual contextualized topic model, EACL 2021 main volume.
@inproceedings{zeroshot_tm,
  author    = {Bianchi, Federico and Terragni, Silvia and Hovy, Dirk and Nozza, Debora and Fersini, Elisabetta},
  editor    = {Merlo, Paola and Tiedemann, Jorg and Tsarfaty, Reut},
  title     = {Cross-lingual Contextualized Topic Models with Zero-shot Learning},
  booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume},
  pages     = {1676--1683},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2021},
  month     = apr,
  doi       = {10.18653/v1/2021.eacl-main.143},
  url       = {https://aclanthology.org/2021.eacl-main.143},
  abstract  = {Many data sets (e.g., reviews, forums, news, etc.) exist parallelly in multiple languages. They all cover the same content, but the linguistic differences make it impossible to use traditional, bag-of-word-based topic models. Models have to be either single-language or suffer from a huge, but extremely sparse vocabulary. Both issues can be addressed by transfer learning. In this paper, we introduce a zero-shot cross-lingual topic model. Our model learns topics on one language (here, English), and predicts them for unseen documents in different languages (here, Italian, French, German, and Portuguese). We evaluate the quality of the topic predictions for the same document in different languages. Our results show that the transferred topics are coherent and stable across languages, which suggests exciting future research directions.},
}


% Top2Vec preprint on arXiv.
% The model name is brace-protected against sentence casing; the url is added
% so the entry matches the other arXiv entries in this file.
@misc{top2vec,
  author        = {Angelov, Dimo},
  title         = {{Top2Vec}: Distributed Representations of Topics},
  year          = {2020},
  eprint        = {2008.09470},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2008.09470},
  doi           = {10.48550/arXiv.2008.09470},
}

% scikit-learn reference paper (Journal of Machine Learning Research, 2011).
% "Python" is braced as a whole word rather than mid-word; the entry had no
% doi or url, so the JMLR url is recorded to make the paper locatable.
@article{scikit-learn,
  author  = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
             and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
             and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A.
             and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  title   = {Scikit-learn: Machine Learning in {Python}},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2825--2830},
  year    = {2011},
  url     = {http://jmlr.org/papers/v12/pedregosa11a.html},
}


% FASTopic paper (NeurIPS 2024); the doi points at the arXiv version.
% The model name is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{fastopic,
  author    = {Wu, Xiaobao and Nguyen, Thong Thanh and Zhang, Delvin Ce and Wang, William Yang and Luu, Anh Tuan},
  title     = {{FASTopic}: Pretrained Transformer is a Fast, Adaptive, Stable, and Transferable Topic Model},
  booktitle = {The Thirty-eighth Annual Conference on Neural Information Processing Systems},
  year      = {2024},
  doi       = {10.48550/arXiv.2405.17978},
}

% Sentence-BERT paper, EMNLP-IJCNLP 2019.
@inproceedings{sentence_transformers,
  author    = {Reimers, Nils and Gurevych, Iryna},
  editor    = {Inui, Kentaro and Jiang, Jing and Ng, Vincent and Wan, Xiaojun},
  title     = {Sentence-{BERT}: Sentence Embeddings using {S}iamese {BERT}-Networks},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  pages     = {3982--3992},
  publisher = {Association for Computational Linguistics},
  address   = {Hong Kong, China},
  year      = {2019},
  month     = nov,
  doi       = {10.18653/v1/D19-1410},
  url       = {https://aclanthology.org/D19-1410/},
  abstract  = {BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS). However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations ({\textasciitilde}65 hours) with BERT. The construction of BERT makes it unsuitable for semantic similarity search as well as for unsupervised tasks like clustering. In this publication, we present Sentence-BERT (SBERT), a modification of the pretrained BERT network that use siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity. This reduces the effort for finding the most similar pair from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while maintaining the accuracy from BERT. We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning tasks, where it outperforms other state-of-the-art sentence embeddings methods.},
}

% topicwizard preprint on arXiv.
% Authors are given in "Last, First" form with LaTeX accent escapes, matching
% the escaped names used by other entries in this file.
@misc{topicwizard,
  author        = {Kardos, M{\'a}rton and Enevoldsen, Kenneth C. and Nielbo, Kristoffer Laigaard},
  title         = {topicwizard -- a Modern, Model-agnostic Framework for Topic Model Visualization and Interpretation},
  year          = {2025},
  eprint        = {2505.13034},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2505.13034},
  doi           = {10.48550/arXiv.2505.13034},
}

% Jacobs and Tschoetschel (2019): topic models as a tool for discourse analysis.
% The author list previously ended in a dangling "and", which makes BibTeX
% parse an extra empty author; it is removed and the umlaut is escaped.
@article{discourse_analysis,
  author    = {Jacobs, Thomas and Tsch{\"o}tschel, Robin},
  title     = {Topic models meet discourse analysis: a quantitative tool for a qualitative approach},
  journal   = {International Journal of Social Research Methodology},
  volume    = {22},
  number    = {5},
  pages     = {469--485},
  year      = {2019},
  publisher = {Routledge},
  doi       = {10.1080/13645579.2019.1576317},
}

% Jockers (2013), "Macroanalysis", University of Illinois Press.
% The author name is stored in normal case and "Last, First" form instead of
% the all-caps export, so styles can format it themselves.
@book{macroanalysis,
  author    = {Jockers, Matthew L.},
  title     = {Macroanalysis: Digital Methods and Literary History},
  publisher = {University of Illinois Press},
  year      = {2013},
  isbn      = {9780252037528},
  url       = {http://www.jstor.org/stable/10.5406/j.ctt2jcc3m},
  urldate   = {2025-05-27},
  abstract  = {In this volume, Matthew L. Jockers introduces readers to large-scale literary computing and the revolutionary potential of macroanalysis--a new approach to the study of the literary record designed for probing the digital-textual world as it exists today, in digital form and in large quantities. Using computational analysis to retrieve key words, phrases, and linguistic patterns across thousands of texts in digital libraries, researchers can draw conclusions based on quantifiable evidence regarding how literary trends are employed over time, across periods, within regions, or within demographic groups, as well as how cultural, historical, and societal linkages may bind individual authors, texts, and genres into an aggregate literary culture. Moving beyond the limitations of literary interpretation based on the close-reading of individual works, Jockers describes how this new method of studying large collections of digital material can help us to better understand and contextualize the individual works within those collections.},
}

% Nguyen and Ho (2023): online customer experience in the hotel sector,
% analysed with dynamic topic modelling and net promoter score.
% The page range uses the BibTeX double hyphen instead of a Unicode en dash.
@article{hotel_sector,
  author  = {Nguyen, Van-Ho and Ho, Thanh},
  title   = {Analysing online customer experience in hotel sector using dynamic topic modelling and net promoter score},
  journal = {Journal of Hospitality and Tourism Technology},
  volume  = {14},
  number  = {2},
  pages   = {258--277},
  year    = {2023},
  month   = feb,
  doi     = {10.1108/jhtt-04-2021-0116},
  url     = {https://www.emerald.com/insight/content/doi/10.1108/jhtt-04-2021-0116/full/html},
}

% Huang et al. (2025): supervised topic model on social media posts, Decision Sciences.
@article{social_media_mining,
  author  = {Huang, Yinghui and Li, Mei and Tsung, Fugee and Chang, Xiangyu},
  title   = {Mining social media data via supervised topic model: Can social media posts inform customer satisfaction?},
  journal = {Decision Sciences},
  year    = {2025},
  month   = jan,
  doi     = {10.1111/deci.12660},
  url     = {https://onlinelibrary.wiley.com/doi/full/10.1111/deci.12660},
}

% Bergamaschi and Po (2015): LDA vs. LSA topic models for movie recommendation.
% The acronyms in the title are brace-protected so sentence-casing styles
% do not lowercase them.
@inproceedings{content_recommendation,
  author    = {Bergamaschi, Sonia and Po, Laura},
  editor    = {Monfort, Val{\'e}rie and Krempels, Karl-Heinz},
  title     = {Comparing {LDA} and {LSA} Topic Models for Content-Based Movie Recommendation Systems},
  booktitle = {Web Information Systems and Technologies},
  pages     = {247--263},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2015},
  isbn      = {978-3-319-27030-2},
  doi       = {10.1007/978-3-319-27030-2_16},
  abstract  = {We propose a plot-based recommendation system, which is based upon an evaluation of similarity between the plot of a video that was watched by a user and a large amount of plots stored in a movie database. Our system is independent from the number of user ratings, thus it is able to propose famous and beloved movies as well as old or unheard movies/programs that are still strongly related to the content of the video the user has watched. The system implements and compares the two Topic Models, Latent Semantic Allocation (LSA) and Latent Dirichlet Allocation (LDA), on a movie database of two hundred thousand plots that has been constructed by integrating different movie databases in a local NoSQL (MongoDB) DBMS. The topic models behaviour has been examined on the basis of standard metrics and user evaluations, performance assessments with 30 users to compare our tool with a commercial system have been conducted.},
}

% Thielmann et al. (2023): unsupervised document classification with web
% scraping, one-class SVM and LDA topic modelling.
% The author list previously ended in a dangling "and", which makes BibTeX
% parse an extra empty author; it is removed, the umlaut is escaped, and the
% acronyms in the title are brace-protected.
@article{unsupervised_classification,
  author    = {Thielmann, Anton and Weisser, Christoph and Krenz, Astrid and S{\"a}fken, Benjamin},
  title     = {Unsupervised document classification integrating web scraping, one-class {SVM} and {LDA} topic modelling},
  journal   = {Journal of Applied Statistics},
  volume    = {50},
  number    = {3},
  pages     = {574--591},
  year      = {2023},
  publisher = {Taylor \& Francis},
  doi       = {10.1080/02664763.2021.1919063},
  note      = {PMID: 36819086},
}

% Yi and Allan (2009): comparative study of topic models for information retrieval.
@inproceedings{information_retrieval,
  author    = {Yi, Xing and Allan, James},
  editor    = {Boughanem, Mohand and Berrut, Catherine and Mothe, Josiane and Soule-Dupuy, Chantal},
  title     = {A Comparative Study of Utilizing Topic Models for Information Retrieval},
  booktitle = {Advances in Information Retrieval},
  pages     = {29--41},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2009},
  isbn      = {978-3-642-00958-7},
  doi       = {10.1007/978-3-642-00958-7_6},
  abstract  = {We explore the utility of different types of topic models for retrieval purposes. Based on prior work, we describe several ways that topic models can be integrated into the retrieval process. We evaluate the effectiveness of different types of topic models within those retrieval approaches. We show that: (1) topic models are effective for document smoothing; (2) more rigorous topic models such as Latent Dirichlet Allocation provide gains over cluster-based models; (3) more elaborate topic models that capture topic dependencies provide no additional gains; (4) smoothing documents by using their similar documents is as effective as smoothing them by using topic models; (5) doing query expansion should utilize topics discovered in the top feedback documents instead of coarse-grained topics from the whole corpus; (6) generally, incorporating topics in the feedback documents for building relevance models can benefit the performance more for queries that have more relevant documents.},
}

% Peng et al. (2025): topic models as data mixers for LM pre-training (arXiv preprint).
@misc{data_mixers,
  author        = {Jiahui Peng and Xinlin Zhuang and Qiu Jiantao and Ren Ma and Jing Yu and Tianyi Bai and Conghui He},
  title         = {Unsupervised Topic Models are Data Mixers for Pre-training Language Models},
  year          = {2025},
  eprint        = {2502.16802},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  doi           = {10.48550/arXiv.2502.16802},
  url           = {https://arxiv.org/abs/2502.16802},
}