% papers.bib — publication database for the group website (al-folio / jekyll-scholar).
% Custom display fields used by the site: bibtex_show, abbr, arxiv, pdf, website, code, selected.
% Entries are grouped by year, then by venue type (conference / journal / preprint).
% 2026
% 2026 - Conference papers
@inproceedings{tao2026omnizip,
  bibtex_show = {true},
  abbr        = {CVPR},
  % {OmniZip} braced so downcasing .bst styles cannot lowercase the system name.
  title       = {{OmniZip}: Audio-Guided Dynamic Token Compression for Fast Omnimodal Large Language Models},
  author      = {Tao, Keda and Shao, Kele and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Wang, Huan},
  booktitle   = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year        = {2026},
  arxiv       = {2511.14582},
  pdf         = {https://arxiv.org/pdf/2511.14582},
  code        = {https://github.com/KD-TAO/OmniZip},
  selected    = {true},
}
@inproceedings{chen2026streamingtom,
  bibtex_show = {true},
  abbr        = {CVPR},
  % {StreamingTOM} braced to preserve internal capitalisation under sentence-casing styles.
  title       = {{StreamingTOM}: Streaming Token Compression for Efficient Video Understanding},
  author      = {Chen, Xueyi and Tao, Keda and Shao, Kele and Wang, Huan},
  booktitle   = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year        = {2026},
  arxiv       = {2510.18269},
  pdf         = {https://arxiv.org/pdf/2510.18269},
  website     = {https://yige24.github.io/StreamingTOM},
  code        = {https://github.com/YIGE24/StreamingTOM},
  selected    = {true},
}
@inproceedings{feng2026reasonmap,
  bibtex_show = {true},
  abbr        = {CVPR},
  % {MLLMs} braced so the acronym survives style-applied downcasing.
  title       = {Can {MLLMs} Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps},
  author      = {Feng, Sicheng and Wang, Song and Ouyang, Shuyi and Kong, Lingdong and Song, Zikai and Zhu, Jianke and Wang, Huan and Wang, Xinchao},
  booktitle   = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year        = {2026},
  arxiv       = {2505.18675},
  pdf         = {https://arxiv.org/pdf/2505.18675},
  website     = {https://fscdc.github.io/Reason-Map/},
  code        = {https://github.com/fscdc/ReasonMap},
  selected    = {true},
}
@inproceedings{jin2026mergemix,
  bibtex_show = {true},
  abbr        = {ICLR},
  % {MergeMix} braced to protect the method name's capitalisation.
  title       = {{MergeMix}: A Unified Augmentation Paradigm for Visual and Multi-Modal Understanding},
  author      = {Jin, Xin and Li, Siyuan and Jian, Siyong and Yu, Kai and Wang, Huan},
  booktitle   = {International Conference on Learning Representations (ICLR)},
  year        = {2026},
  arxiv       = {2510.23479},
  pdf         = {https://arxiv.org/pdf/2510.23479},
  website     = {https://jinxins.github.io/MergeMix_Web/},
  code        = {https://github.com/JinXins/MergeMix},
  selected    = {true},
}
@inproceedings{zhu2026obsdiff,
  bibtex_show = {true},
  abbr        = {ICLR},
  % {OBS-Diff} braced so the acronym-bearing method name keeps its case.
  title       = {{OBS-Diff}: Accurate Pruning For Diffusion Models in One-Shot},
  author      = {Zhu, Junhan and Wang, Hesong and Su, Mingluo and Wang, Zefang and Wang, Huan},
  booktitle   = {International Conference on Learning Representations (ICLR)},
  year        = {2026},
  arxiv       = {2510.06751},
  pdf         = {https://arxiv.org/pdf/2510.06751},
  website     = {https://alrightlone.github.io/OBS-Diff-Webpage/},
  code        = {https://github.com/Alrightlone/OBS-Diff},
  selected    = {true},
}
@inproceedings{feng2026rewardmap,
  bibtex_show = {true},
  abbr        = {ICLR},
  % {RewardMap} braced to preserve the camel-case method name.
  title       = {{RewardMap}: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning},
  author      = {Feng, Sicheng and Tuo, Kaiwen and Wang, Song and Kong, Lingdong and Zhu, Jianke and Wang, Huan},
  booktitle   = {International Conference on Learning Representations (ICLR)},
  year        = {2026},
  arxiv       = {2510.02240},
  pdf         = {https://arxiv.org/pdf/2510.02240},
  website     = {https://fscdc.github.io/RewardMap/},
  code        = {https://github.com/fscdc/RewardMap},
  selected    = {true},
}
@inproceedings{li2026arpg,
  bibtex_show = {true},
  abbr        = {ICLR},
  title       = {Autoregressive Image Generation with Randomized Parallel Decoding},
  author      = {Li, Haopeng and Yang, Jinyue and Li, Guoqi and Wang, Huan},
  booktitle   = {International Conference on Learning Representations (ICLR)},
  year        = {2026},
  arxiv       = {2503.10568},
  pdf         = {https://arxiv.org/pdf/2503.10568},
  website     = {https://hp-l33.github.io/projects/arpg},
  code        = {https://github.com/hp-l33/ARPG},
  selected    = {true},
}
@inproceedings{su2026rose,
  bibtex_show = {true},
  abbr        = {CPAL Oral},
  % {ROSE} and {SparseGPT} braced so acronym/method names survive style downcasing.
  title       = {{ROSE}: Reordered {SparseGPT} for More Accurate One-Shot Large Language Models Pruning},
  author      = {Su, Mingluo and Wang, Huan},
  booktitle   = {Conference on Parsimony and Learning (CPAL)},
  year        = {2026},
  arxiv       = {2603.05878},
  pdf         = {https://arxiv.org/pdf/2603.05878},
  website     = {https://mingluo-su.github.io/ROSE-Webpage/},
  code        = {https://github.com/mingluo-su/ROSE},
}
@inproceedings{bai2026ressvd,
  bibtex_show = {true},
  abbr        = {CPAL},
  % {ResSVD} and {SVD} braced to keep the acronyms capitalised.
  title       = {{ResSVD}: Residual Compensated {SVD} for Large Language Model Compression},
  author      = {Bai, Haolei and Jian, Siyong and Liang, Tuo and Yin, Yu and Wang, Huan},
  booktitle   = {Conference on Parsimony and Learning (CPAL)},
  year        = {2026},
  arxiv       = {2505.20112},
  pdf         = {https://arxiv.org/pdf/2505.20112},
  code        = {https://github.com/deadlykitten4/ERC-SVD},
}
% 2026 - Journal papers
@article{shao2026tokens,
  bibtex_show = {true},
  abbr        = {TMLR},
  title       = {When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios},
  author      = {Shao, Kele and Tao, Keda and Zhang, Kejia and Feng, Sicheng and Cai, Mu and Shang, Yuzhang and You, Haoxuan and Qin, Can and Sui, Yang and Wang, Huan},
  journal     = {Transactions on Machine Learning Research (TMLR)},
  year        = {2026},
  arxiv       = {2507.20198},
  pdf         = {https://arxiv.org/pdf/2507.20198},
  code        = {https://github.com/cokeshao/Awesome-Multimodal-Token-Compression},
}
% 2026 - Preprints (newest first by arxiv number)
@article{tao2026lvomnibench,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {LVOmniBench} and {LLMs} braced so the benchmark name and acronym keep their case.
  title       = {{LVOmniBench}: Pioneering Long Audio-Video Understanding Evaluation for Omnimodal {LLMs}},
  author      = {Tao, Keda and Zheng, Yuhua and Xu, Jia and Du, Wenjie and Shao, Kele and Wang, Hesong and Chen, Xueyi and Jin, Xin and Zhu, Junhan and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Qin, Can and Zhang, Yulun and Yang, Ming-Hsuan and Wang, Huan},
  journal     = {arXiv preprint arXiv:2603.19217},
  year        = {2026},
  arxiv       = {2603.19217},
  pdf         = {https://arxiv.org/pdf/2603.19217},
  website     = {https://kd-tao.github.io/LVOmniBench/},
  code        = {https://github.com/KD-TAO/LVOmniBench},
}
@article{zou2026mobilekernelbench,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {MobileKernelBench} and {LLMs} braced to protect capitalisation.
  title       = {{MobileKernelBench}: Can {LLMs} Write Efficient Kernels for Mobile Devices?},
  % Initials separated ("A. R.") per BibTeX name conventions; glued initials ("A.R.")
  % parse as a single given-name token and abbreviate incorrectly.
  author      = {Zou, Xingze and Wang, Jing and Zheng, Yuhua and Chen, Xueyi and Bai, Haolei and Kong, Lingcheng and Abu-Bakar, Syed A. R. and Wang, Zhaode and Lv, Chengfei and Hu, Haoji and Wang, Huan},
  journal     = {arXiv preprint arXiv:2603.11935},
  year        = {2026},
  arxiv       = {2603.11935},
  pdf         = {https://arxiv.org/pdf/2603.11935},
  website     = {https://zeezou-isee.github.io/Mobilekernelbench/},
  code        = {https://github.com/zeezou-isee/Mobilekernelbench},
}
@article{bai2026dice,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {DICE} and {CUDA} braced so the acronyms keep their case.
  title       = {{DICE}: Diffusion Large Language Models Excel at Generating {CUDA} Kernels},
  author      = {Bai, Haolei and Kong, Lingcheng and Chen, Xueyi and Wang, Jianmian and Tao, Zhiqiang and Wang, Huan},
  journal     = {arXiv preprint arXiv:2602.11715},
  year        = {2026},
  arxiv       = {2602.11715},
  pdf         = {https://arxiv.org/pdf/2602.11715},
  website     = {https://deadlykitten4.github.io/DICE/},
  code        = {https://github.com/deadlykitten4/DICE},
}
% 2025
% 2025 - Conference papers
@inproceedings{shao2025holitom,
  bibtex_show = {true},
  abbr        = {NeurIPS},
  % {HoliTom} braced to preserve the method name's capitalisation.
  title       = {{HoliTom}: Holistic Token Merging for Fast Video Large Language Models},
  author      = {Shao, Kele and Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  booktitle   = {Advances in Neural Information Processing Systems (NeurIPS)},
  year        = {2025},
  arxiv       = {2505.21334},
  pdf         = {https://arxiv.org/pdf/2505.21334},
  website     = {https://cokeshao.github.io/HoliTom_Web/},
  code        = {https://github.com/cokeshao/HoliTom},
  selected    = {true},
}
@inproceedings{zhang2025poisoncure,
  bibtex_show = {true},
  abbr        = {NeurIPS},
  % {LVMs} braced so the acronym survives style downcasing.
  title       = {Poison as Cure: Visual Noise for Mitigating Object Hallucinations in {LVMs}},
  author      = {Zhang, Kejia and Tao, Keda and Tang, Jiasheng and Wang, Huan},
  booktitle   = {Advances in Neural Information Processing Systems (NeurIPS)},
  year        = {2025},
  arxiv       = {2501.19164},
  pdf         = {https://arxiv.org/pdf/2501.19164},
  website     = {https://kejiazhang-robust.github.io/poison-cure-lvm},
  code        = {https://github.com/KejiaZhang-Robust/VAP},
}
@inproceedings{li2025freqexit,
  bibtex_show = {true},
  abbr        = {NeurIPS},
  % {FreqExit} braced to preserve the method name's capitalisation.
  title       = {{FreqExit}: Enabling Early-Exit Inference for Visual Autoregressive Models via Frequency-Aware Guidance},
  author      = {Li, Ying and Lv, Chengfei and Wang, Huan},
  booktitle   = {Advances in Neural Information Processing Systems (NeurIPS)},
  year        = {2025},
  pdf         = {https://openreview.net/pdf?id=DUlZTgLkeh},
  website     = {https://neuraliying.github.io/FreqExit/},
  code        = {https://github.com/NeuraLiying/FreqExit},
  selected    = {true},
}
@inproceedings{tao2025dycoke,
  bibtex_show = {true},
  abbr        = {CVPR},
  % {DyCoke} braced to preserve the method name's capitalisation.
  title       = {{DyCoke}: Dynamic Compression of Tokens for Fast Video Large Language Models},
  author      = {Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  booktitle   = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year        = {2025},
  arxiv       = {2411.15024},
  pdf         = {https://arxiv.org/pdf/2411.15024},
  code        = {https://github.com/KD-TAO/DyCoke},
  selected    = {true},
}
@inproceedings{wu2025ondevice,
  bibtex_show = {true},
  abbr        = {ICCV},
  title       = {On-Device Diffusion Transformer Policy for Efficient Robot Manipulation},
  author      = {Wu, Yiming and Wang, Huan and Chen, Zhenghao and Pang, Jianxin and Xu, Dong},
  booktitle   = {IEEE/CVF International Conference on Computer Vision (ICCV)},
  year        = {2025},
  arxiv       = {2508.00697},
  pdf         = {https://arxiv.org/pdf/2508.00697},
}
% 2025 - Journal papers
@article{wu2025niagara,
  bibtex_show = {true},
  abbr        = {TCSVT},
  % {Niagara} braced to keep the system name capitalised.
  title       = {{Niagara}: Normal-Integrated Geometric Affine Field for Scene Reconstruction from a Single View},
  author      = {Wu, Xianzu and Ai, Zhenxin and Yang, Harry and Lim, Ser-Nam and Liu, Jun and Wang, Huan},
  % "(TCSVT)" appended for consistency with the file's other venue fields,
  % e.g. "Transactions on Machine Learning Research (TMLR)".
  journal     = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)},
  year        = {2025},
  arxiv       = {2503.12553},
  pdf         = {https://arxiv.org/pdf/2503.12553},
  website     = {https://ai-kunkun.github.io/Niagara_page/},
  code        = {https://github.com/xianzuwu/Niagara},
}
% 2025 - Preprints (newest first by arxiv number)
@article{tao2025omniagent,
  bibtex_show = {true},
  abbr        = {arXiv},
  title       = {Active Perception Agent for Omnimodal Audio-Video Understanding},
  author      = {Tao, Keda and Du, Wenjie and Yu, Bohan and Wang, Weiqiang and Liu, Jian and Wang, Huan},
  journal     = {arXiv preprint arXiv:2512.23646},
  year        = {2025},
  arxiv       = {2512.23646},
  pdf         = {https://arxiv.org/pdf/2512.23646},
  website     = {https://kd-tao.github.io/OmniAgent/},
  code        = {https://github.com/KD-TAO/OmniAgent},
}
@article{du2025rlkv,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {RL-Guided} and {KV} braced (whole words, per brace-protection convention)
  % so the acronyms survive style downcasing.
  title       = {Which Heads Matter for Reasoning? {RL-Guided} {KV} Cache Compression},
  author      = {Du, Wenjie and Jiang, Li and Tao, Keda and Liu, Xue and Wang, Huan},
  journal     = {arXiv preprint arXiv:2510.08525},
  year        = {2025},
  arxiv       = {2510.08525},
  pdf         = {https://arxiv.org/pdf/2510.08525},
  website     = {https://kurt232.github.io/RLKV/},
  code        = {https://github.com/kurt232/RLKV},
}
@article{kong2025concur,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {ConCuR} braced to preserve the method name's capitalisation.
  title       = {{ConCuR}: Conciseness Makes State-of-the-Art Kernel Generation},
  author      = {Kong, Lingcheng and Wei, Jiateng and Shen, Hanzhang and Wang, Huan},
  journal     = {arXiv preprint arXiv:2510.07356},
  year        = {2025},
  arxiv       = {2510.07356},
  pdf         = {https://arxiv.org/pdf/2510.07356},
}
@article{tuo2025sparsessm,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {SparseSSM} braced to preserve the method name's capitalisation.
  title       = {{SparseSSM}: Efficient Selective Structured State Space Models Can Be Pruned in One-Shot},
  author      = {Tuo, Kaiwen and Wang, Huan},
  journal     = {arXiv preprint arXiv:2506.09613},
  year        = {2025},
  arxiv       = {2506.09613},
  pdf         = {https://arxiv.org/pdf/2506.09613},
  website     = {https://cfintech.github.io/SparseSSM-Web/},
  code        = {https://github.com/CFinTech/SparseSSM},
}
@article{tao2025plugandplay,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {KV} braced so the acronym survives style downcasing.
  title       = {Plug-and-Play 1.x-Bit {KV} Cache Quantization for Video Large Language Models},
  author      = {Tao, Keda and You, Haoxuan and Sui, Yang and Qin, Can and Wang, Huan},
  journal     = {arXiv preprint arXiv:2503.16257},
  year        = {2025},
  arxiv       = {2503.16257},
  pdf         = {https://arxiv.org/pdf/2503.16257},
  code        = {https://github.com/KD-TAO/VidKV},
}
% 2024
% 2024 - Preprints
@article{feng2024oracle,
  bibtex_show = {true},
  abbr        = {arXiv},
  % {Oracle} braced: "Oracle Pruning" is a named technique whose capital must
  % survive sentence-casing styles.
  title       = {Is {Oracle} Pruning the True {Oracle}?},
  author      = {Feng, Sicheng and Tao, Keda and Wang, Huan},
  journal     = {arXiv preprint arXiv:2412.00143},
  year        = {2024},
  arxiv       = {2412.00143},
  pdf         = {https://arxiv.org/pdf/2412.00143},
  website     = {https://fscdc.github.io/Oracle-Pruning-Sanity-Check/},
  code        = {https://github.com/fscdc/Oracle-Pruning-Sanity-Check},
}