-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcvmlmu.bib
More file actions
362 lines (326 loc) · 23.8 KB
/
cvmlmu.bib
File metadata and controls
362 lines (326 loc) · 23.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
@inproceedings{AICS2025-Aktas,
title={Benchmarking {SAM2}-based Trackers on {FMOX}},
author={Senem Aktas and Charles Markham and John McDonald and Rozenn Dahyot},
booktitle={33rd International Conference on Artificial Intelligence and Cognitive Science (AICS 2025)},
address={Dublin, Ireland},
year={2025},
month=dec,
pages={1--12},
doi={10.5281/zenodo.19226612},
url={Paper=https://cvmlmu.github.io/FMOX/FMOXaics2025.pdf ArXiv=https://arxiv.org/abs/2512.09633
Code=https://cvmlmu.github.io/FMOX/ OA=https://openalex.org/w4417291887},
abstract={Several object tracking pipelines extending Segment Anything Model 2 (SAM2) have been proposed in the past year,
where the approach is to follow and segment the object from a single exemplar template provided by the user on an initialization frame.
We propose to benchmark these high performing trackers (SAM2, EfficientTAM, DAM4SAM and SAMURAI) on datasets containing fast moving objects (FMO)
specifically designed to be challenging for tracking approaches. The goal is to understand better current limitations in state-of-the-art trackers by
providing more detailed insights on the behavior of these trackers. We show that overall the trackers DAM4SAM and SAMURAI perform well on more challenging sequences.},
}
@inproceedings{Aktas2025,
author = {Senem Aktas and Charles Markham and John McDonald and Rozenn Dahyot},
title = {Benchmarking {EfficientTAM} on {FMO} datasets},
booktitle = {Irish Machine Vision and Image Processing (IMVIP 2025)},
address = {Ulster University, Derry-Londonderry, Northern Ireland},
year = {2025},
pages = {59--66},
abstract = {Fast and tiny object tracking remains a challenge in computer vision and in this paper we first introduce
a JSON metadata file associated with four open source datasets of Fast Moving Objects (FMOs) image
sequences. In addition, we extend the description of the FMOs datasets with additional ground truth information
in JSON format (called FMOX) with object size information. Finally we use our FMOX file to test
a recently proposed foundational model for tracking (called EfficientTAM) showing that its performance
compares well with the pipelines originally tailored for these FMO datasets. Our comparison of these
state-of-the-art techniques on FMOX is provided with Trajectory Intersection of Union (TIoU) scores. The code
and JSON is shared open source allowing FMOX to be accessible and usable for other machine learning
pipelines aiming to process FMO datasets.},
url = {Paper=https://cvmlmu.github.io/FMOX/FMOXimvip2025.pdf ArXiv=https://arxiv.org/abs/2509.06536
Code=https://cvmlmu.github.io/FMOX/ OA=https://openalex.org/w4415056026},
doi = {10.21251/8043511f-bf93-4b36-9348-0726af0987f6},
keywords = {Tracking, fast moving objects},
}
@techreport{DGMMC2024,
title={Performance of {Gaussian} Mixture Model Classifiers on Embedded Feature Spaces},
author={Jeremy Chopin and Rozenn Dahyot},
institution={arXiv},
year={2024},
month=oct,
eprint={2410.13421},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/pdf/2410.13421},
doi={10.48550/arXiv.2410.13421},
abstract={Data embeddings with CLIP and ImageBind provide powerful features for the analysis of multimedia and/or multimodal data. We assess their performance here for classification using a Gaussian Mixture models (GMMs) based layer as an alternative to the standard Softmax layer. GMMs based classifiers have recently been shown to have interesting performances as part of deep learning pipelines trained end-to-end. Our first contribution is to investigate GMM based classification performance taking advantage of the embedded spaces CLIP and ImageBind. Our second contribution is in proposing our own GMM based classifier with a lower parameters count than previously proposed. Our findings are, that in most cases, on these tested embedded spaces, one gaussian component in the GMMs is often enough for capturing each class, and we hypothesize that this may be due to the contrastive loss used for training these embedded spaces that naturally concentrates features together for each class. We also observed that ImageBind often provides better performance than CLIP for classification of image datasets even when these embedded spaces are compressed using PCA.},
}
@inbook{doi:10.1142/9789811289125_0011,
author = {Jeremy Chopin and Jean-Baptiste Fasquel and Harold Mouchere and Rozenn Dahyot and Isabelle Bloch},
chapter = {Reinforcement Learning and Sequential {QAP}-Based Graph Matching for Semantic Segmentation of Images},
title = {Emerging Topics in Pattern Recognition and Artificial Intelligence},
publisher = {World Scientific},
pages = {259--294},
year = {2024},
doi = {10.1142/9789811289125_0011},
url = {https://www.worldscientific.com/doi/abs/10.1142/9789811289125_0011},
eprint = {https://www.worldscientific.com/doi/pdf/10.1142/9789811289125_0011},
abstract = {This chapter addresses the fundamental task of semantic image analysis by exploiting structural information (spatial relationships between image regions). We propose to combine a deep neural network (DNN) with graph matching (formulated as a quadratic assignment problem (QAP)) where graphs encode efficiently structural information related to regions segmented by the DNN. Our novel approach solves the QAP sequentially for matching graphs, in the context of image semantic segmentation, where the optimal sequence for graph matching is conveniently defined using reinforcement learning (RL) based on the region membership probabilities produced by the DNN and their structural relationships. Our RL-based strategy for solving QAP sequentially allows us to significantly reduce the combinatorial complexity for graph matching. Two experiments are performed on two public datasets dedicated respectively to the semantic segmentation of face images and sub-cortical region of the brain. Results show that the proposed RL-based ordering performs better than using a random ordering, especially when using DNNs that have been trained on a limited number of samples. The open-source code and data are shared with the community.}
}
@article{SmartHangar2024,
author={Luke Casey and John Dooley and Michael Codd and Rozenn Dahyot and Marco Cognetti and Thomas Mullarkey and Peter Redmond and Gerard Lacey},
title={A real-time Digital Twin for active safety in an aircraft hangar},
journal={Frontiers in Virtual Reality},
volume={5},
year={2024},
url={https://www.frontiersin.org/articles/10.3389/frvir.2024.1372923},
doi={10.3389/frvir.2024.1372923},
abstract={The aerospace industry prioritises safety protocols to prevent accidents that can result in injuries, fatalities, or aircraft damage. One of the potential hazards that can occur while manoeuvring aircraft in and out of a hangar is collisions with other aircraft or buildings, which can lead to operational disruption and costly repairs. To tackle this issue, we have developed the Smart Hangar project, which aims to alert personnel of increased risks and prevent incidents from happening. The Smart Hangar project uses computer vision, LiDAR, and ultra-wideband sensors to track all objects and individuals within the hangar space. These data inputs are combined to form a real-time 3D Digital Twin (DT) of the hangar environment. The Active Safety system then uses the DT to perform real-time path planning, collision prediction, and safety alerts for tow truck drivers and hangar personnel. This paper provides a detailed overview of the system architecture, including the technologies used, and highlights the system's performance. By implementing this system, we aim to reduce the risk of accidents in the aerospace industry and increase safety for all personnel involved.
Additionally, we identify future research directions for the Smart Hangar project.}
}
@inproceedings{Panahi2023,
author= {Solmaz Panahi and Jeremy Chopin and Matej Ulicny and Rozenn Dahyot},
title= {Improving {GMM} registration with class encoding},
booktitle= {Irish Machine Vision and Image Processing (IMVIP 2023)},
year= {2023},
abstract={Point set registration is critical in many applications such as computer vision, pattern recognition, or in fields like robotics and medical imaging.
This paper focuses on reformulating point set registration using Gaussian Mixture Models while considering attributes associated with each point. Our approach introduces class score vectors as additional features
to the spatial data information. By incorporating these attributes, we enhance the optimization process by penalizing incorrect matching terms. Experimental results show that our approach
with class scores outperforms the original algorithm in both accuracy and speed.},
url= {https://github.com/solmak97/GMMReg_Extension},
doi={10.5281/zenodo.8205096},
}
@inproceedings{KoteyInterSpeech2023,
author={Samantha Kotey and Rozenn Dahyot and Naomi Harte},
booktitle={Proc. INTERSPEECH 2023},
title={Query Based Acoustic Summarization for Podcasts},
year={2023},
pages={1483--1487},
doi={10.21437/Interspeech.2023-864},
url={https://www.isca-speech.org/archive/pdfs/interspeech_2023/kotey23_interspeech.pdf},
address={Dublin, Ireland},
month=aug,
}
@techreport{ulicny2023combining,
title={Combining geolocation and height estimation of objects from street level imagery},
author={Matej Ulicny and Vladimir A. Krylov and Julie Connelly and Rozenn Dahyot},
institution={arXiv},
year={2023},
eprint={2305.08232},
archivePrefix={arXiv},
primaryClass={cs.CV},
doi={10.48550/arXiv.2305.08232},
url={https://arxiv.org/pdf/2305.08232.pdf},
abstract={We propose a pipeline for combined multi-class object geolocation and height estimation from street level RGB imagery, which is considered as a single available input data modality. Our solution is formulated via Markov Random Field optimization with deterministic output. The proposed technique uses image metadata along with coordinates of objects detected in the image plane as found by a custom-trained Convolutional Neural Network. Computing the object height using our methodology, in addition to object geolocation, has negligible effect on the overall computational cost. Accuracy is demonstrated experimentally for water drains and road signs on which we achieve average elevation estimation error lower than 20cm.},
}
@article{CHOPIN2023103744,
title = {Model-based inexact graph matching on top of {DNNs} for semantic scene understanding},
journal = {Computer Vision and Image Understanding},
pages = {103744},
year = {2023},
issn = {1077-3142},
doi = {10.1016/j.cviu.2023.103744},
url = {https://arxiv.org/pdf/2301.07468.pdf},
author = {Jeremy Chopin and Jean-Baptiste Fasquel and Harold Mouchère and Rozenn Dahyot and Isabelle Bloch},
keywords = {Graph matching, Deep learning, Image segmentation, Volume segmentation, Quadratic assignment problem},
abstract = {Deep learning based pipelines for semantic segmentation often ignore structural information available on annotated images used for training. We propose a novel post-processing module enforcing structural knowledge about the objects of interest to improve segmentation results provided by deep neural networks (DNNs). This module corresponds to a “many-to-one-or-none” inexact graph matching approach, and is formulated as a quadratic assignment problem. Our approach is compared to a DNN-based segmentation on two public datasets, one for face segmentation from 2D RGB images (FASSEG), and the other for brain segmentation from 3D MRIs (IBSR). Evaluations are performed using two types of structural information: distances and directional relations that are user defined, this choice being a hyper-parameter of our proposed generic framework. On FASSEG data, results show that our module improves accuracy of the DNN by about 6.3% i.e. the Hausdorff distance (HD) decreases from 22.11 to 20.71 on average. With IBSR data, the improvement is of 51% better accuracy with HD decreasing from 11.01 to 5.4. Finally, our approach is shown to be resilient to small training datasets that often limit the performance of deep learning methods: the improvement increases as the size of the training dataset decreases.}
}
@techreport{Dahyot_PCC2022,
author = {Dahyot, Rozenn},
title = {Principal Component Classification},
institution = {arXiv},
year = {2022},
keywords = {Supervised Learning, PCA, classification, metric learning, deep learning, class encoding},
abstract={We propose to directly compute classification estimates
by learning features encoded with their class scores.
Our resulting model has a encoder-decoder structure suitable for supervised learning, it is computationally efficient and performs well for classification on several datasets.},
doi = {10.48550/ARXIV.2210.12746},
url = {https://arxiv.org/pdf/2210.12746.pdf},
copyright = {Creative Commons Attribution 4.0 International},
}
@inproceedings{KoteySLT2023,
author={Kotey, Samantha and Dahyot, Rozenn and Harte, Naomi},
booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
title={Fine Grained Spoken Document Summarization Through Text Segmentation},
year={2023},
pages={647--654},
abstract={Podcast transcripts are long spoken documents of conversational dialogue. Challenging to summarize, podcasts cover a diverse range of topics, vary in length, and have uniquely different linguistic styles. Previous studies in podcast summarization have generated short, concise dialogue summaries. In contrast, we propose a method to generate long fine-grained summaries, which describe details of sub-topic narratives. Leveraging a readability formula, we curate a data subset to train a long sequence transformer for abstractive summarization. Through text segmentation, we filter the evaluation data and exclude specific segments of text. We apply the model to segmented data, producing different types of fine grained summaries. We show that appropriate filtering creates comparable results on ROUGE and serves as an alternative method to truncation. Experiments show our model outperforms previous studies on the Spotify podcast dataset when tasked with generating longer sequences of text.},
doi={10.1109/SLT54892.2023.10022829},
url={https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10022829},
month=jan,
}
@article{ULICNY2022108707,
author= {Matej Ulicny and Vladimir A. Krylov and Rozenn Dahyot},
title= {Harmonic Convolutional Networks based on Discrete Cosine Transform},
journal={Pattern Recognition},
abstract={Convolutional neural networks (CNNs) learn filters in order to capture local correlation patterns in feature space. We propose to learn these filters as combinations of preset spectral filters defined by the Discrete Cosine Transform (DCT). Our proposed DCT-based harmonic blocks replace conventional convolutional layers to produce partially or fully harmonic versions of new or existing CNN architectures. Using DCT energy compaction properties, we demonstrate how the harmonic networks can be efficiently compressed by truncating high-frequency information in harmonic blocks thanks to the redundancies in the spectral domain. We report extensive experimental validation demonstrating benefits of the introduction of harmonic blocks into state-of-the-art CNN models in image classification, object detection and semantic segmentation applications.},
volume= {129},
pages={1--12},
year= {2022},
issn = {0031-3203},
url= {https://arxiv.org/pdf/2001.06570.pdf},
doi={10.1016/j.patcog.2022.108707},
note={arXiv.2001.06570 Github: https://github.com/matej-ulicny/harmonic-networks},
}
@inproceedings{ChopinICPRAI2022a,
title={Improving semantic segmentation with graph-based structural knowledge},
author={J. Chopin and J.-B. Fasquel and H. Mouchere and R. Dahyot and I. Bloch},
abstract={Deep learning based pipelines for semantic segmentation often
ignore structural information available on annotated images used for
training. We propose a novel post-processing module enforcing structural
knowledge about the objects of interest to improve segmentation
results provided by deep learning. This module corresponds to a
“many-to-one-or-none” inexact graph matching approach, and is formulated as
a quadratic assignment problem. Using two standard measures for evaluation,
we show experimentally that our pipeline for segmentation of
3D MRI data of the brain outperforms the baseline CNN (U-Net) used
alone. In addition, our approach is shown to be resilient to small training
datasets that often limit the performance of deep learning.},
doi={10.1007/978-3-031-09037-0_15},
url= {https://hal.inria.fr/hal-03633029},
note={hal-03633029},
booktitle={Pattern Recognition and Artificial Intelligence},
year={2022},
publisher={Springer International Publishing},
editor={El Yacoubi, Moun{\^i}m
and Granger, Eric
and Yuen, Pong Chi
and Pal, Umapada
and Vincent, Nicole},
month=jun,
HAL_ID = {hal-03633029},
address={Paris, France},
isbn={978-3-031-09037-0},
pages={173--184},
}
@inproceedings{ChopinICPRAI2022b,
title={{QAP} Optimisation with Reinforcement Learning for Faster Graph Matching in Sequential Semantic Image Analysis},
author={J. Chopin and J.-B. Fasquel and H. Mouchere and R. Dahyot and I. Bloch},
abstract={The paper addresses the fundamental task of semantic image
analysis by exploiting structural information (spatial relationships
between image regions). We propose to perform such semantic image
analysis by combining a deep neural network (CNN) with graph matching
where graphs encode efficiently structural information related to regions
segmented by the CNN. Our novel approach solves the quadratic assignment
problem (QAP) sequentially for matching graphs. The optimal
sequence for graph matching is conveniently defined using reinforcement
learning (RL) based on the region membership probabilities produced by
the CNN and their structural relationships. Our RL based strategy for
solving QAP sequentially allows us to significantly reduce the combinatorial
complexity for graph matching. Preliminary experiments are performed
on both a synthetic dataset and a public dataset dedicated to the
semantic segmentation of face images. Results show that the proposed
RL-based ordering dramatically outperforms random ordering, and that
our strategy is about 386 times faster than a global QAP-based approach,
while preserving similar segmentation accuracy.},
publisher={Springer International Publishing},
editor={El Yacoubi, Moun{\^i}m
and Granger, Eric
and Yuen, Pong Chi
and Pal, Umapada
and Vincent, Nicole},
isbn={978-3-031-09037-0},
doi={10.1007/978-3-031-09037-0_5},
url= {https://hal.inria.fr/hal-03633036/},
note={hal-03633036},
booktitle={Pattern Recognition and Artificial Intelligence},
year={2022},
month=jun,
pages={47--58},
address={Paris, France},
}
@inproceedings{karaali2022drvnet,
title={{DR-VNet}: Retinal Vessel Segmentation via Dense Residual {UNet}},
author={Ali Karaali and Rozenn Dahyot and Donal J. Sexton},
year={2022},
booktitle={Pattern Recognition and Artificial Intelligence},
doi={10.1007/978-3-031-09037-0_17},
note={Github https://github.com/alikaraali/DR-VNet, ArXivDOI:10.48550/arXiv.2111.04739},
url= {https://arxiv.org/pdf/2111.04739.pdf},
abstract={Accurate retinal vessel segmentation is an important task for many computer-aided diagnosis systems. Yet, it is still a challenging problem due to the complex vessel structures of an eye. Numerous vessel segmentation methods have been proposed recently, however more research is needed to deal with poor segmentation of thin and tiny vessels. To address this, we propose a new deep learning pipeline combining the efficiency of residual dense net blocks and, residual squeeze and excitation blocks. We validate experimentally our approach on three datasets and show that our pipeline outperforms current state of the art techniques on the sensitivity metric relevant to assess capture of small vessels.},
publisher={Springer International Publishing},
editor={El Yacoubi, Moun{\^i}m
and Granger, Eric
and Yuen, Pong Chi
and Pal, Umapada
and Vincent, Nicole},
isbn={978-3-031-09037-0},
month=jun,
address={Paris, France},
eprint={2111.04739},
archivePrefix={arXiv},
primaryClass={eess.IV}
}
@inproceedings{ChaoImvip2021,
author= {C.-J. Liu and Matej Ulicny and Michael Manzke and Rozenn Dahyot},
title= {Context Aware Object Geotagging},
booktitle= {Irish Machine Vision and Image Processing (IMVIP 2021)},
year= {2021},
abstract={We propose an approach for geolocating assets from street view imagery
by improving the quality of the metadata associated with the images using
Structure from Motion, and by using contextual geographic information extracted
from OpenStreetMap. Our pipeline is validated experimentally against the state of
the art approaches for geotagging traffic lights.},
url= {https://arxiv.org/pdf/2108.06302.pdf},
doi={10.48550/arXiv.2108.06302},
archivePrefix= {arXiv},
eprint= {2108.06302}
}
@article{McDonnell2021,
title= {Model for predicting perception of facial action unit activation using virtual humans},
journal= {Computers \& Graphics},
doi = {10.1016/j.cag.2021.07.022},
volume= {100},
pages= {81--92},
year= {2021},
note= {Winner 2022 Graphics Replicability Stamp Initiative (GRSI) best paper award; Github: https://github.com/Roznn/facial-blendshapes},
issn= {0097-8493},
url= {https://roznn.github.io/facial-blendshapes/CAG2021.pdf},
author= {Rachel McDonnell and Katja Zibrek and Emma Carrigan and Rozenn Dahyot},
keywords= {facial action unit, perception, virtual character},
abstract= {Blendshape facial rigs are used extensively in the industry for facial animation of
virtual humans. However, storing and manipulating large numbers of facial meshes
(blendshapes) is costly in terms of memory and computation for gaming applications.
Blendshape rigs are comprised of sets of semantically-meaningful expressions, which
govern how expressive the character will be, often based on Action Units from the Facial
Action Coding System (FACS). However, the relative perceptual importance of blendshapes has not yet been investigated. Research in Psychology and Neuroscience has
shown that our brains process faces differently than other objects so we postulate that
the perception of facial expressions will be feature-dependent rather than based purely
on the amount of movement required to make the expression. Therefore, we believe that
perception of blendshape visibility will not be reliably predicted by numerical calculations of the difference between the expression and the neutral mesh. In this paper, we
explore the noticeability of blendshapes under different activation levels, and present
new perceptually-based models to predict perceptual importance of blendshapes. The
models predict visibility based on commonly-used geometry and image-based metrics.}
}
@inproceedings{alghamdi2021sliced,
title = {Sliced {L2} Distance for Colour Grading},
author = {Hana Alghamdi and Rozenn Dahyot},
booktitle = {2021 29th European Signal Processing Conference (EUSIPCO)},
doi = {10.23919/EUSIPCO54536.2021.9616260},
year = {2021},
pages = {671--675},
eprint = {2102.09297},
archivePrefix = {arXiv},
primaryClass = {cs.CV},
abstract = {We propose a new method with L2 distance that maps one N-dimensional distribution to another,
taking into account available information about correspondences. We solve the high-dimensional problem
in 1D space using an iterative projection approach. To show the potentials of this mapping, we apply it
to colour transfer between two images that exhibit overlapped scenes. Experiments show quantitative and
qualitative competitive results as compared with the state of the art colour transfer methods.},
note = {https://arxiv.org/pdf/2102.09297.pdf},
url = {https://eurasip.org/Proceedings/Eusipco/Eusipco2021/pdfs/0000671.pdf}
}