-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbookmarks.go
More file actions
432 lines (391 loc) · 11.7 KB
/
bookmarks.go
File metadata and controls
432 lines (391 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
// bookmarks.go
// ============================================================================
// Author: Coislinianus 120
// Date created: February 2025
// Version = "1.0"
// License = "CC0 1.0"
// ============================================================================
//
// A utility that parses bookmark files (HTML export and JSON), deduplicates
// entries, fetches each bookmark URL, extracts headline, paragraph and meta
// description content, and writes the augmented results to a CSV file.
//
// Features:
// - Parses browser-style HTML bookmark export and nested JSON bookmark structures.
// - Removes duplicate links.
// - Concurrent content fetching with a configurable worker limit and per-request timeouts.
// - Simple media-file detection to skip downloading large media resources.
// - Cleans extracted text and writes results to bookmarks_dataset.csv in the program working directory.
//
// Usage:
// - Place bookmark files (.html and .json) inside a "bookmarks" directory in the current working directory.
// - Run the program; results are written to bookmarks_dataset.csv.
//
// Notes:
// - HTTP client has a timeout and simple transport configuration.
// - The code uses goquery to parse HTML from local files and fetched pages.
package main
import (
"context"
"encoding/csv"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
)
// Bookmark represents a bookmark entry parsed from HTML or JSON input.
//
// The struct contains original bookmark metadata (Title, Link, AddDate, LastModified)
// and fields populated after content retrieval (LastChecked, Active, HeadersContent,
// Paragraphs, MetaDescription).
type Bookmark struct {
	Title        string // anchor text (HTML) or "title" field (JSON); may be empty
	Link         string // bookmark URL; entries with an empty Link are dropped during dedup
	AddDate      string // raw "add_date" (HTML) / "dateAdded" (JSON) value from the source file
	LastModified string // raw "last_modified" (HTML) / "lastModified" (JSON) value from the source file
	// Fields appended after content retrieval:
	LastChecked     string // date the URL was fetched, formatted 2006-01-02
	Active          string // "yes" if the page was fetched and parsed successfully, otherwise "no"
	HeadersContent  string // cleaned, space-joined text of all h1-h6 elements
	Paragraphs      string // cleaned, space-joined text of all <p> elements
	MetaDescription string // cleaned content attribute of <meta name="description">
}
// HTTPClient is a shared HTTP client with timeouts and a lightweight custom transport.
// Timeout and Dialer values are conservative to avoid long hangs during mass fetches.
//
// NOTE(review): the dial timeout (5s) exceeds the overall client timeout (4s),
// so the client-level deadline always fires first — confirm whether the dialer
// value is intentional.
var HTTPClient = &http.Client{
	Timeout: 4 * time.Second, // caps the entire request, including body read
	Transport: &http.Transport{
		DialContext: (&net.Dialer{
			Timeout:   5 * time.Second,
			KeepAlive: 30 * time.Second,
		}).DialContext,
	},
}
// cleanText normalizes whitespace in extracted text.
//
// Newlines, tabs, carriage returns and single quotes are all converted to
// spaces, then runs of whitespace are collapsed so the result contains only
// single spaces with no leading or trailing whitespace.
func cleanText(text string) string {
	if len(text) == 0 {
		return ""
	}
	normalized := strings.NewReplacer(
		"\n", " ",
		"\t", " ",
		"\r", " ",
		"'", " ",
	).Replace(text)
	return strings.Join(strings.Fields(normalized), " ")
}
// parseHTMLFile reads an HTML bookmark export file and extracts bookmark entries.
//
// The file is streamed directly into goquery rather than read fully into memory
// and round-tripped through a string (the previous ReadFile + string conversion
// copied the whole file twice). The function finds every <a> element and reads
// its title text plus the href, add_date and last_modified attributes where
// present; missing attributes yield empty strings.
//
// Returns the parsed bookmarks (possibly empty) and any file-open or
// HTML-parse error.
func parseHTMLFile(filePath string) ([]Bookmark, error) {
	data := []Bookmark{}
	f, err := os.Open(filePath)
	if err != nil {
		return data, err
	}
	defer f.Close()
	doc, err := goquery.NewDocumentFromReader(f)
	if err != nil {
		return data, err
	}
	log.Printf("Parsing HTML: %s\n", filePath)
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		title := strings.TrimSpace(s.Text())
		link, _ := s.Attr("href")
		addDate, _ := s.Attr("add_date")
		lastModified, _ := s.Attr("last_modified")
		data = append(data, Bookmark{
			Title:        title,
			Link:         link,
			AddDate:      addDate,
			LastModified: lastModified,
		})
	})
	return data, nil
}
// parseJSONFile reads a JSON bookmark file and recursively extracts bookmark entries.
//
// It supports nested structures by walking maps and slices, looking for objects
// that contain a "uri" field. Common fields like "title", "dateAdded", and
// "lastModified" are extracted when present.
//
// Bug fix: browser exports (e.g. Firefox's bookmarks JSON) store dateAdded and
// lastModified as JSON *numbers* (microsecond timestamps), which json.Unmarshal
// decodes as float64. The previous string-only type assertions silently dropped
// those values; they are now converted to decimal text as well.
func parseJSONFile(filePath string) ([]Bookmark, error) {
	data := []Bookmark{}
	bytes, err := ioutil.ReadFile(filePath)
	if err != nil {
		return data, err
	}
	var jsonData interface{}
	if err = json.Unmarshal(bytes, &jsonData); err != nil {
		return data, err
	}
	log.Printf("Parsing JSON: %s\n", filePath)
	// asString renders a decoded JSON field as text, tolerating both string
	// and numeric representations. Missing (nil) fields become "".
	asString := func(val interface{}) string {
		switch v := val.(type) {
		case string:
			return v
		case float64:
			// Timestamps are integral microsecond counts; %.0f avoids the
			// scientific notation that %v would produce for large floats.
			return fmt.Sprintf("%.0f", v)
		case nil:
			return ""
		default:
			return fmt.Sprintf("%v", v)
		}
	}
	var extractBookmarks func(node interface{})
	extractBookmarks = func(node interface{}) {
		switch v := node.(type) {
		case map[string]interface{}:
			if uri, exists := v["uri"]; exists {
				data = append(data, Bookmark{
					Title:        asString(v["title"]),
					Link:         fmt.Sprintf("%v", uri),
					AddDate:      asString(v["dateAdded"]),
					LastModified: asString(v["lastModified"]),
				})
			}
			if children, exists := v["children"]; exists {
				extractBookmarks(children)
			}
		case []interface{}:
			for _, item := range v {
				extractBookmarks(item)
			}
		}
	}
	extractBookmarks(jsonData)
	return data, nil
}
// removeDuplicates returns the bookmarks with duplicate Link values dropped.
//
// Entries with an empty Link are discarded entirely. For each distinct link
// only the first occurrence survives, and the input order is otherwise
// preserved.
func removeDuplicates(data []Bookmark) []Bookmark {
	unique := make([]Bookmark, 0, len(data))
	seen := make(map[string]struct{}, len(data))
	for _, entry := range data {
		if entry.Link == "" {
			continue
		}
		if _, dup := seen[entry.Link]; dup {
			continue
		}
		seen[entry.Link] = struct{}{}
		unique = append(unique, entry)
	}
	return unique
}
// extractContent fetches the bookmark's URL and pulls headline, paragraph and
// meta-description text out of the returned page.
//
// It returns the (possibly updated) Bookmark plus three outcome flags:
//   - active:  the page responded and its content was extracted
//   - httpErr: request construction/transport failure, a status code above 403,
//     unparsable HTML, or a skipped media URL
//   - tm:      always false here; timeout reporting is handled by the caller
//
// URLs ending in a known audio/media suffix are marked inactive without being
// fetched at all, to avoid downloading large media resources.
func extractContent(bm Bookmark) (Bookmark, bool, bool, bool) {
	today := time.Now().Format("2006-01-02")

	// markDead stamps the check date, flags the bookmark inactive, and
	// reports the httpErr outcome. Shared by every failure path.
	markDead := func() (Bookmark, bool, bool, bool) {
		bm.LastChecked = today
		bm.Active = "no"
		return bm, false, true, false
	}

	// Skip large media resources without fetching them.
	link := strings.ToLower(bm.Link)
	for _, ext := range []string{".mp3", ".m3u", ".pls", ".aac", ".wav", ".ogg"} {
		if strings.HasSuffix(link, ext) {
			return markDead()
		}
	}

	req, err := http.NewRequest("GET", bm.Link, nil)
	if err != nil {
		return markDead()
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0")

	resp, err := HTTPClient.Do(req)
	if err != nil {
		return markDead()
	}
	defer resp.Body.Close()

	// Treat status codes above 403 as inactive/unusable.
	if resp.StatusCode > 403 {
		return markDead()
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return markDead()
	}

	// collect gathers the text of every element matching selector, joins the
	// pieces with spaces, and normalizes whitespace.
	collect := func(selector string) string {
		var parts []string
		doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
			parts = append(parts, s.Text())
		})
		return cleanText(strings.Join(parts, " "))
	}

	// Attr yields "" when the meta tag or attribute is absent.
	metaDesc, _ := doc.Find("meta[name='description']").Attr("content")

	bm.LastChecked = today
	bm.Active = "yes"
	bm.HeadersContent = collect("h1, h2, h3, h4, h5, h6")
	bm.Paragraphs = collect("p")
	bm.MetaDescription = cleanText(metaDesc)
	return bm, true, false, false
}
// main drives the full pipeline: discover bookmark files under ./bookmarks,
// parse them, deduplicate by link, fetch page content with a bounded worker
// pool, and write the augmented rows to bookmarks_dataset.csv.
func main() {
	// Determine working paths (bookmarks folder and output CSV).
	currentDir, err := os.Getwd()
	if err != nil {
		log.Printf("Error getting working directory: %v\n", err)
		return
	}
	folderPath := filepath.Join(currentDir, "bookmarks")
	csvFilePath := filepath.Join(currentDir, "bookmarks_dataset.csv")
	var allData []Bookmark
	htmlCount := 0
	jsonCount := 0
	// Walk the bookmarks folder for .html and .json files.
	// A parse failure in an individual file is logged and skipped so one bad
	// file does not abort the run; only a walk error stops processing.
	err = filepath.Walk(folderPath, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		lowerName := strings.ToLower(info.Name())
		if strings.HasSuffix(lowerName, ".html") {
			htmlCount++
			bms, err := parseHTMLFile(path)
			if err != nil {
				log.Printf("Error parsing HTML file %s: %v\n", path, err)
			} else {
				allData = append(allData, bms...)
			}
		} else if strings.HasSuffix(lowerName, ".json") {
			jsonCount++
			bms, err := parseJSONFile(path)
			if err != nil {
				log.Printf("Error parsing JSON file %s: %v\n", path, err)
			} else {
				allData = append(allData, bms...)
			}
		}
		return nil
	})
	if err != nil {
		log.Printf("Error walking bookmarks folder: %v\n", err)
		return
	}
	totalFiles := htmlCount + jsonCount
	log.Printf("Found %d files in %s\n\t- %d HTML files\n\t- %d JSON files.\n", totalFiles, folderPath, htmlCount, jsonCount)
	log.Printf("Found %d bookmark entries. Checking for duplicates...\n", len(allData))
	uniqueData := removeDuplicates(allData)
	duplicates := len(allData) - len(uniqueData)
	log.Printf("Removed %d duplicate entries.\nGathering content data...\n", duplicates)
	// Concurrent processing setup.
	// resultCh is buffered to hold every result, so workers never block on
	// send; sem is a counting semaphore capping concurrent fetches at
	// maxWorkers.
	var wg sync.WaitGroup
	resultCh := make(chan Bookmark, len(uniqueData))
	var mu sync.Mutex // guards the three outcome counters below
	activeCount, httpErrorCount, timeoutCount := 0, 0, 0
	maxWorkers := 20
	sem := make(chan struct{}, maxWorkers)
	startTime := time.Now()
	for _, bm := range uniqueData {
		wg.Add(1)
		sem <- struct{}{} // acquire a worker slot (blocks while maxWorkers are busy)
		go func(b Bookmark) {
			defer wg.Done()
			// Per-bookmark watchdog: extractContent runs in its own goroutine
			// and races this 10s deadline. On timeout the bookmark is
			// recorded as inactive here; the fetch goroutine is not
			// cancelled, but resCh is buffered so it can still exit once
			// HTTPClient's own timeout fires.
			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer cancel()
			type result struct {
				bm              Bookmark
				active          bool
				httpErr, tm     bool
			}
			resCh := make(chan result, 1)
			go func() {
				updated, active, httpErr, tm := extractContent(b)
				resCh <- result{updated, active, httpErr, tm}
			}()
			var res result
			select {
			case res = <-resCh:
			case <-ctx.Done():
				b.LastChecked = time.Now().Format("2006-01-02")
				b.Active = "no"
				res = result{b, false, false, true}
			}
			// Tally the outcome under the mutex; resultCh collects the row.
			mu.Lock()
			if res.active {
				activeCount++
			}
			if res.httpErr {
				httpErrorCount++
			}
			if res.tm {
				timeoutCount++
			}
			mu.Unlock()
			resultCh <- res.bm
			elapsed := time.Since(startTime)
			log.Printf("\rProcessed bookmark: %s | Elapsed: %v", b.Link, elapsed)
			<-sem // release the worker slot
		}(bm)
	}
	wg.Wait()
	close(resultCh) // safe: all senders have finished after wg.Wait
	var results []Bookmark
	for bm := range resultCh {
		results = append(results, bm)
	}
	log.Println("\nContent extraction complete.")
	log.Printf("Valid entries: %d\nHTTP Errors: %d\nTimeout errors: %d\n", activeCount, httpErrorCount, timeoutCount)
	log.Println("Writing data to CSV file...")
	csvFile, err := os.Create(csvFilePath)
	if err != nil {
		log.Printf("Error creating CSV file: %v\n", err)
		return
	}
	defer csvFile.Close()
	writer := csv.NewWriter(csvFile)
	header := []string{"title", "link", "add_date", "last_modified", "last_checked", "active", "h_tag_content", "p", "meta_description"}
	if err := writer.Write(header); err != nil {
		log.Printf("Error writing header: %v\n", err)
		return
	}
	for _, bm := range results {
		row := []string{
			bm.Title,
			bm.Link,
			bm.AddDate,
			bm.LastModified,
			bm.LastChecked,
			bm.Active,
			bm.HeadersContent,
			bm.Paragraphs,
			bm.MetaDescription,
		}
		if err := writer.Write(row); err != nil {
			log.Printf("Error writing row: %v\n", err)
		}
	}
	// Flush buffered CSV data and surface any deferred write error.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Printf("Error flushing CSV: %v\n", err)
	}
	log.Println("Done.")
}