-
-
Notifications
You must be signed in to change notification settings - Fork 295
Expand file tree
/
Copy pathRssHelper.kt
More file actions
257 lines (232 loc) · 10.1 KB
/
RssHelper.kt
File metadata and controls
257 lines (232 loc) · 10.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
package me.ash.reader.infrastructure.rss
import android.content.Context
import android.util.Log
import com.rometools.modules.mediarss.MediaEntryModule
import com.rometools.modules.mediarss.MediaModule
import com.rometools.modules.mediarss.types.UrlReference
import com.rometools.rome.feed.synd.SyndEntry
import com.rometools.rome.feed.synd.SyndFeed
import com.rometools.rome.feed.synd.SyndImageImpl
import com.rometools.rome.io.SyndFeedInput
import com.rometools.rome.io.XmlReader
import dagger.hilt.android.qualifiers.ApplicationContext
import java.nio.charset.Charset
import java.util.*
import javax.inject.Inject
import kotlinx.coroutines.CoroutineDispatcher
import kotlinx.coroutines.withContext
import me.ash.reader.domain.model.article.Article
import me.ash.reader.domain.model.feed.Feed
import me.ash.reader.domain.repository.FeedDao
import me.ash.reader.infrastructure.di.IODispatcher
import me.ash.reader.infrastructure.html.Readability
import me.ash.reader.ui.ext.currentAccountId
import me.ash.reader.ui.ext.decodeHTML
import me.ash.reader.ui.ext.extractDomain
import me.ash.reader.ui.ext.isFuture
import me.ash.reader.ui.ext.isTooOld
import me.ash.reader.ui.ext.spacerDollar
import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.executeAsync
import okhttp3.internal.commonIsSuccessful
import okio.IOException
import org.jsoup.Jsoup
/**
 * Matches a self-closing `<enclosure url="..." type="..."/>` tag inside raw text.
 * Group 1 captures the value of the `url` attribute.
 */
val enclosureRegex = Regex("""<enclosure\s+url="([^"]+)"\s+type=".*"\s*/>""")

/**
 * Matches the `src` attribute that follows an `img` token, across line breaks.
 * Group 1 captures the opening quote (reused as the closing quote) and group 2 the attribute
 * value; values beginning with `data` are rejected by the negative lookahead.
 */
val imgRegex = Regex("""img.*?src=(["'])((?!data).*?)\1""", RegexOption.DOT_MATCHES_ALL)
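/**
 * Helper for working with RSS feeds: fetching and parsing feeds, extracting full article content,
 * building articles from feed entries, finding thumbnails and resolving feed icons.
 */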
class RssHelper
@Inject
constructor(
@ApplicationContext private val context: Context,
@IODispatcher private val ioDispatcher: CoroutineDispatcher,
private val okHttpClient: OkHttpClient,
) {
/**
 * Requests [feedLink], parses the response body as a syndication feed and attaches an icon to it.
 *
 * The `Content-Type` header is handed to the XML reader; when it carries no `charset=` part,
 * `; charset=UTF-8` is appended, and when the header is absent `text/xml; charset=UTF-8` is used.
 * The feed icon's link and url are both set to the result of [queryRssIconLink].
 *
 * @param feedLink URL of the feed to fetch.
 * @return the parsed feed with its icon populated.
 */
@Throws(Exception::class)
suspend fun searchFeed(feedLink: String): SyndFeed =
    withContext(ioDispatcher) {
        val httpResponse = response(okHttpClient, feedLink)
        val declaredType = httpResponse.header("Content-Type")
        val effectiveType =
            when {
                declaredType == null -> "text/xml; charset=UTF-8"
                declaredType.contains("charset=", ignoreCase = true) -> declaredType
                else -> "$declaredType; charset=UTF-8"
            }
        httpResponse.body.byteStream().use { stream ->
            val parsedFeed = SyndFeedInput().build(XmlReader(stream, effectiveType))
            // Install the icon first, then resolve its link, mirroring the assignment order.
            val icon = SyndImageImpl()
            parsedFeed.icon = icon
            icon.link = queryRssIconLink(feedLink)
            icon.url = icon.link
            parsedFeed
        }
    }
/**
 * Downloads the page at [link] and returns its extracted article content as an HTML string.
 *
 * Decoding: the charset declared in the response's content type is used when present. Otherwise
 * the body is peeked as UTF-8 and a `<meta http-equiv="content-type">` element is consulted; if it
 * names a different charset the body is read again with that charset. Any failure while
 * inspecting the meta element falls back to UTF-8.
 *
 * If the first `<h1>` of the extracted content has exactly the text [title], it is removed.
 *
 * @param link URL of the article page.
 * @param title article title compared against the first `<h1>` of the extracted content.
 * @return the extracted content rendered via `toString()`.
 * @throws IOException if the response is not successful, or no article content was extracted.
 */
@Throws(Exception::class)
suspend fun parseFullContent(link: String, title: String): String {
    return withContext(ioDispatcher) {
        // `use` releases the response on every path, including the non-successful one,
        // which previously threw without closing the body.
        response(okHttpClient, link).use { response ->
            if (!response.isSuccessful) throw IOException(response.message)

            val responseBody = response.body
            val charset = responseBody.contentType()?.charset()
            val content =
                responseBody.source().use { source ->
                    if (charset != null) {
                        source.readString(charset)
                    } else {
                        // peek() reads without consuming, so the body can be decoded again
                        // below if the document declares a non-UTF-8 charset.
                        val peekContent = source.peek().readString(Charsets.UTF_8)
                        val charsetFromMeta =
                            runCatching {
                                Jsoup.parse(peekContent, link)
                                    .selectFirst("meta[http-equiv=content-type]")
                                    ?.attr("content")
                                    ?.substringAfter("charset=")
                                    ?.removeSurrounding("\"")
                                    ?.lowercase()
                                    ?.let { Charset.forName(it) }
                                    ?: Charsets.UTF_8
                            }
                                .getOrDefault(Charsets.UTF_8)
                        if (charsetFromMeta == Charsets.UTF_8) {
                            peekContent
                        } else {
                            source.readString(charsetFromMeta)
                        }
                    }
                }

            val articleContent =
                Readability.parseToElement(content, link)
                    ?: throw IOException("articleContent is null")
            val h1Element = articleContent.selectFirst("h1")
            if (h1Element != null && h1Element.hasText() && h1Element.text() == title) {
                h1Element.remove()
            }
            articleContent.toString()
        }
    }
}
/**
 * Fetches and parses the feed at `feed.url` and converts its newest entries into articles.
 *
 * Entries are consumed in feed order and conversion stops at the first entry whose link equals
 * [latestLink]; when [latestLink] is null every entry is converted. The work runs on the injected
 * IO dispatcher because parsing reads the response stream with blocking calls.
 *
 * This is best-effort: any failure other than coroutine cancellation is logged and results in an
 * empty list. Cancellation is rethrown so a cancelled caller is not handed an empty result.
 *
 * @param feed feed whose URL is requested and to which the articles are attributed.
 * @param latestLink link of the entry at which to stop, or null to take all entries.
 * @param preDate passed through to [buildArticleFromSyndEntry] for every entry.
 * @return the converted articles, or an empty list on failure.
 */
suspend fun queryRssXml(
    feed: Feed,
    latestLink: String?,
    preDate: Date = Date(),
): List<Article> =
    withContext(ioDispatcher) {
        try {
            val accountId = context.currentAccountId
            val response = response(okHttpClient, feed.url)
            val contentType = response.header("Content-Type")
            val httpContentType =
                contentType?.let {
                    if (it.contains("charset=", ignoreCase = true)) it
                    else "$it; charset=UTF-8"
                } ?: "text/xml; charset=UTF-8"
            response.body.byteStream().use { inputStream ->
                SyndFeedInput()
                    .apply { isPreserveWireFeed = true }
                    .build(XmlReader(inputStream, httpContentType))
                    .entries
                    .asSequence()
                    .takeWhile { latestLink == null || latestLink != it.link }
                    .map { buildArticleFromSyndEntry(feed, accountId, it, preDate) }
                    .toList()
            }
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Never swallow cancellation: structured concurrency relies on it propagating.
            throw e
        } catch (e: Exception) {
            e.printStackTrace()
            Log.e("RLog", "queryRssXml[${feed.name}]: ${e.message}")
            emptyList<Article>()
        }
    }
/**
 * Converts one parsed feed entry into an [Article] belonging to [feed] and [accountId].
 *
 * - The date is the entry's published date, else its updated date, each accepted only when it is
 *   neither in the future relative to [preDate] nor too old; otherwise [preDate] is used.
 * - The raw description prefers the entry's contents (joined with newlines) over its description,
 *   while the short description prefers the description and is truncated to 280 characters.
 * - The image comes from the entry itself, falling back to a scan of the raw content/description.
 *
 * @param feed feed the entry belongs to; supplies the feed id and the fallback title.
 * @param accountId account the article is stored under; also used as the prefix of the new id.
 * @param syndEntry parsed entry to convert.
 * @param preDate reference date used for validation, as the date fallback and as `updateAt`.
 * @return the newly built article with a random UUID-based id.
 */
fun buildArticleFromSyndEntry(
    feed: Feed,
    accountId: Int,
    syndEntry: SyndEntry,
    preDate: Date = Date(),
): Article {
    val description = syndEntry.description?.value
    val contentParts = syndEntry.contents
    val joinedContent =
        if (contentParts.isNotEmpty()) contentParts.joinToString("\n") { part -> part.value }
        else null

    // A date is only trusted when it is not ahead of preDate and not too old.
    val isAcceptable: (Date) -> Boolean = { candidate ->
        !candidate.isFuture(preDate) && !candidate.isTooOld()
    }
    val articleDate =
        syndEntry.publishedDate?.takeIf(isAcceptable)
            ?: syndEntry.updatedDate?.takeIf(isAcceptable)
            ?: preDate

    return Article(
        id = accountId.spacerDollar(UUID.randomUUID().toString()),
        accountId = accountId,
        feedId = feed.id,
        date = articleDate,
        title = syndEntry.title.decodeHTML() ?: feed.name,
        author = syndEntry.author,
        rawDescription = joinedContent ?: description ?: "",
        shortDescription =
            Readability.parseToText(description ?: joinedContent, syndEntry.link).take(280),
        img = findThumbnail(syndEntry) ?: findThumbnail(joinedContent ?: description),
        link = syndEntry.link ?: "",
        updateAt = preDate,
    )
}
/**
 * Looks for a thumbnail URL on the entry itself.
 *
 * The URL of the first enclosure wins when it is present; otherwise the entry's Media RSS
 * module, if any, is searched.
 *
 * @param syndEntry entry to inspect.
 * @return the thumbnail URL, or null when neither source provides one.
 */
fun findThumbnail(syndEntry: SyndEntry): String? {
    syndEntry.enclosures?.firstOrNull()?.url?.let { enclosureUrl ->
        return enclosureUrl
    }
    val media = (syndEntry.getModule(MediaModule.URI) as? MediaEntryModule) ?: return null
    return findThumbnail(media)
}
/**
 * Picks a thumbnail URL from a Media RSS module.
 *
 * Thumbnails are collected from the module's own metadata, then from each media group's
 * metadata, then from each media content's metadata; the first one found is used. When there is
 * no thumbnail, the first media content whose medium is `"image"` is used, provided its reference
 * is a [UrlReference].
 *
 * @param mediaModule Media RSS module of a feed entry.
 * @return the URL as a string, or null when no usable URL exists.
 */
private fun findThumbnail(mediaModule: MediaEntryModule): String? {
    val candidates =
        buildList {
            add(mediaModule.metadata)
            addAll(mediaModule.mediaGroups.map { mediaGroup -> mediaGroup.metadata })
            addAll(mediaModule.mediaContents.map { content -> content.metadata })
        }
            .flatMap { it.thumbnail.toList() }

    candidates.firstOrNull()?.url?.toString()?.let { thumbnailUrl ->
        return thumbnailUrl
    }

    val imageMedia = mediaModule.mediaContents.firstOrNull { it.medium == "image" }
    // Safe-call before toString(): calling toString() on a null URL yields the literal
    // string "null", which would then be stored as the article's image URL.
    return (imageMedia?.reference as? UrlReference)?.url?.toString()
}
/**
 * Scans raw entry text for an image URL.
 *
 * An `<enclosure>` tag's url is preferred when it is not blank; otherwise the first `img` `src`
 * value is returned.
 *
 * @param text raw text of an entry, or null.
 * @return the URL found, or null when [text] is null or nothing matches.
 */
fun findThumbnail(text: String?): String? {
    if (text == null) return null

    enclosureRegex
        .find(text)
        ?.groupValues
        ?.get(1)
        ?.takeIf { it.isNotBlank() }
        ?.let { enclosureUrl ->
            return enclosureUrl
        }

    // From https://gitlab.com/spacecowboy/Feeder
    // The pattern uses a negative lookahead to skip inline base64 `data:` URLs and reuses the
    // captured opening quote as the closing quote. Base64 encoded images can be quite large
    // and crash database cursors, hence the extra `data:` guard on the result.
    return imgRegex.find(text)?.groupValues?.get(2)?.takeUnless { it.startsWith("data:") }
}
/**
 * Resolves an icon URL for the site serving [feedLink].
 *
 * The lookup uses the domain extracted from [feedLink], or [feedLink] itself when no domain
 * could be extracted, and logs the outcome.
 *
 * @param feedLink feed URL; null or empty yields null without any lookup.
 * @return whatever the icon finder returns for that domain or link.
 */
suspend fun queryRssIconLink(feedLink: String?): String? {
    if (feedLink == null || feedLink.isEmpty()) return null

    val finder = BestIconFinder(okHttpClient)
    val domain = feedLink.extractDomain()
    val iconLink = finder.findBestIcon(domain ?: feedLink)
    Log.i("RLog", "queryRssIconByLink: get $iconLink from $domain")
    return iconLink
}
/**
 * Stores [iconLink] as the icon of [feed] by updating a copy of the feed through [feedDao].
 *
 * @param feedDao DAO used to write the updated feed.
 * @param feed feed whose icon is replaced; the instance itself is not modified.
 * @param iconLink new icon URL.
 */
suspend fun saveRssIcon(feedDao: FeedDao, feed: Feed, iconLink: String) {
    val feedWithIcon = feed.copy(icon = iconLink)
    feedDao.update(feedWithIcon)
}
/**
 * Issues a request for [url] on [client] with default builder settings and suspends until the
 * response arrives. The caller owns the returned response.
 */
private suspend fun response(client: OkHttpClient, url: String): okhttp3.Response {
    val request = Request.Builder().url(url).build()
    return client.newCall(request).executeAsync()
}
}