diff --git a/.gitignore b/.gitignore index 6cf9326..7796f16 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,10 @@ # logs geckodriver.log + +# Local build outputs +*.xpi +*.zip + +# Local captured exports +*.ndjson diff --git a/FORK-NOTICE.md b/FORK-NOTICE.md new file mode 100644 index 0000000..12f7ac8 --- /dev/null +++ b/FORK-NOTICE.md @@ -0,0 +1,20 @@ +# Pesquisa Social Fork Notice + +This project is a modified fork of Zeeschuimer. + +Original project: +- Zeeschuimer +- Copyright (c) Stijn Peeters +- Original repository: https://github.com/digitalmethodsinitiative/zeeschuimer + +This fork contains local modifications by Danilo F. Marinho and is distributed +with the obligations of the Mozilla Public License 2.0. + +MPL 2.0 compliance notes: +- Files derived from the original MPL-covered code remain under MPL 2.0. +- Original copyright and license notices must be preserved. +- If this extension is distributed in executable form, the corresponding source + code for the MPL-covered files must also be made available. + +Fork contact: +- danilo-f.marinho@hotmail.com diff --git a/README.md b/README.md index 85ff94c..8de4d1e 100644 --- a/README.md +++ b/README.md @@ -15,18 +15,10 @@ be exported as a JSON file or exported to a [4CAT](https://github.com/digitalmet analysis and storage. Zeeschuimer is primarily intended as a companion to 4CAT, but you can also integrate its output into your own analysis pipeline. 
-Currently, it supports the following platforms: +This fork currently supports the following platforms: * [TikTok](https://www.tiktok.com) (posts and comments) -* [Instagram](https://www.instagram.com) (posts only) -* [X/Twitter](https://www.x.com) -* [LinkedIn](https://www.linkedin.com) -* [9gag](https://9gag.com) -* [Imgur](https://imgur.com) -* [Douyin](https://douyin.com) -* [Gab](https://gab.com) -* [Truth Social](https://truth.social) -* [Pinterest](https://pinterest.com) -* [RedNote/Xiaohongshu](https://xiaohongshu.com) +* [Instagram](https://www.instagram.com) (posts, reels, and comments) +* [X/Twitter](https://www.x.com) (posts and comments) Platform support requires regular maintenance to keep up with changes to the platforms. If something does not work, we welcome issues and pull requests. See 'Limitations' below for some known limitations to data capture. diff --git a/SOURCE-CODE-README.md b/SOURCE-CODE-README.md new file mode 100644 index 0000000..9898995 --- /dev/null +++ b/SOURCE-CODE-README.md @@ -0,0 +1,56 @@ +# Source Code Package for Mozilla Review + +This archive contains the source code used to build the Firefox extension +"Pesquisa Social". + +## Extension identity + +- Add-on name: Pesquisa Social +- Add-on ID: pesquisa-social@local +- Fork contact: danilo-f.marinho@hotmail.com + +## Origin and license + +This project is a modified fork of Zeeschuimer. + +- Original project: https://github.com/digitalmethodsinitiative/zeeschuimer +- Original copyright: Stijn Peeters +- License for MPL-covered files: Mozilla Public License 2.0 + +See these files in the source package: + +- `LICENSE` +- `FORK-NOTICE.md` + +## Build environment + +- Operating system used to produce the submitted package: Windows +- Shell used: PowerShell +- Archive/build script: `build-xpi.ps1` +- Required software: PowerShell with .NET support for `System.IO.Compression` + +No Node.js bundling, transpilation, or webpack build is required for this +extension package. 
+ +## How to reproduce the submitted XPI + +1. Extract this source code package. +2. Open PowerShell in the project root. +3. Run: + +```powershell +powershell -ExecutionPolicy Bypass -File .\build-xpi.ps1 +``` + +4. The script creates: + +```text +pesquisa-social-v1.13.9.xpi +``` + +## Notes for reviewers + +- The package includes third-party assets already present in the repository, + such as Font Awesome and font files. +- The submitted extension package excludes local data exports and other build + artifacts such as `.ndjson`, `.xpi`, and `.zip` files. diff --git a/build-source-zip.ps1 b/build-source-zip.ps1 new file mode 100644 index 0000000..059dfb2 --- /dev/null +++ b/build-source-zip.ps1 @@ -0,0 +1,61 @@ +Add-Type -AssemblyName System.IO.Compression +Add-Type -AssemblyName System.IO.Compression.FileSystem + +$manifest = Get-Content "manifest.json" | ConvertFrom-Json +$version = $manifest.version +$output = "pesquisa-social-source-v$version.zip" +$root = (Get-Location).Path + +$includePaths = @( + "manifest.json", + "popup", + "js", + "modules", + "images", + "fonts", + "inc", + "LICENSE", + "README.md", + "FORK-NOTICE.md", + "SOURCE-CODE-README.md", + "build-xpi.ps1" +) + +if (Test-Path $output) { + Remove-Item -LiteralPath $output -Force +} + +$zip = [System.IO.Compression.ZipFile]::Open($output, [System.IO.Compression.ZipArchiveMode]::Create) + +try { + foreach ($path in $includePaths) { + $fullPath = Join-Path $root $path + + if (Test-Path $fullPath -PathType Leaf) { + [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile( + $zip, + $fullPath, + ($path -replace '\\', '/'), + [System.IO.Compression.CompressionLevel]::Optimal + ) | Out-Null + continue + } + + if (Test-Path $fullPath -PathType Container) { + Get-ChildItem -LiteralPath $fullPath -Recurse -File | ForEach-Object { + $entryName = $_.FullName.Substring($root.Length).TrimStart('\') -replace '\\', '/' + [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile( + $zip, + $_.FullName, 
+ $entryName, + [System.IO.Compression.CompressionLevel]::Optimal + ) | Out-Null + } + } + } +} +finally { + $zip.Dispose() +} + +Write-Output "Created $output" diff --git a/build-xpi.ps1 b/build-xpi.ps1 new file mode 100644 index 0000000..82d3dba --- /dev/null +++ b/build-xpi.ps1 @@ -0,0 +1,73 @@ +$manifest = Get-Content "manifest.json" | ConvertFrom-Json +$name = ($manifest.name -replace '[^a-zA-Z0-9_-]+', '-').ToLower().Trim('-') +$version = $manifest.version +$output = "$name-v$version.xpi" + +Add-Type -AssemblyName System.IO.Compression +Add-Type -AssemblyName System.IO.Compression.FileSystem + +$root = (Get-Location).Path + +$excludeNames = @( + ".git", + ".github", + ".build-xpi", + "tests", + "create-zip.sh", + "create-zip-bash.sh", + "build-xpi.ps1" +) + +$excludePatterns = @( + "*.zip", + "*.xpi", + "*.DS_Store", + "*.ndjson" +) + +function Should-Skip($item) { + foreach ($name in $excludeNames) { + if ($item.Name -eq $name) { + return $true + } + } + + foreach ($pattern in $excludePatterns) { + if ($item.Name -like $pattern) { + return $true + } + } + + return $false +} + +if (Test-Path $output) { + Remove-Item -LiteralPath $output -Force +} + +$zip = [System.IO.Compression.ZipFile]::Open($output, [System.IO.Compression.ZipArchiveMode]::Create) + +try { + Get-ChildItem -LiteralPath $root -Recurse -File | ForEach-Object { + $file = $_ + + $segments = $file.FullName.Substring($root.Length).TrimStart('\').Split('\') + foreach ($segment in $segments) { + if ($excludeNames -contains $segment) { + return + } + } + + if (Should-Skip $file) { + return + } + + $entryName = ($file.FullName.Substring($root.Length).TrimStart('\')) -replace '\\', '/' + [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile($zip, $file.FullName, $entryName, [System.IO.Compression.CompressionLevel]::Optimal) | Out-Null + } +} +finally { + $zip.Dispose() +} + +Write-Output "Created $output" diff --git a/js/zs-background.js b/js/zs-background.js index 10880f7..c7c7db9 100644 --- 
a/js/zs-background.js +++ b/js/zs-background.js @@ -344,8 +344,6 @@ window.zeeschuimer = { } }); })); - - return; } } }, diff --git a/manifest.json b/manifest.json index b598fa0..df60eaa 100644 --- a/manifest.json +++ b/manifest.json @@ -1,14 +1,14 @@ { - "description": "Collect data while browsing social media platforms and upload it for analysis later", + "description": "Capture social media data while browsing and export it for later analysis", "manifest_version": 2, - "name": "Zeeschuimer", - "version": "1.13.6", - "homepage_url": "https://github.com/digitalmethodsinitiative/zeeschuimer", + "name": "Pesquisa Social", + "version": "1.13.9", + "homepage_url": "mailto:danilo-f.marinho@hotmail.com", "browser_specific_settings": { "gecko": { - "update_url": "https://extensions.digitalmethods.net/updates.json", + "id": "pesquisa-social@local", "data_collection_permissions": { "required": ["none"] } @@ -22,7 +22,7 @@ "browser_action": { "default_icon": "images/zeeschuimer-64.png", - "default_title": "Zeeschuimer Status" + "default_title": "Pesquisa Social" }, "permissions": [ @@ -41,17 +41,9 @@ "modules/tiktok.js", "modules/tiktok-comments.js", "modules/instagram.js", - "modules/linkedin.js", - "modules/9gag.js", - "modules/imgur.js", + "modules/instagram-comments.js", "modules/twitter.js", - "modules/douyin.js", - "modules/gab.js", - "modules/truth.js", - "modules/threads.js", - "modules/pinterest.js", - "modules/rednote.js", - "modules/rednote-comments.js" + "modules/twitter-comments.js" ] } } diff --git a/modules/instagram-comments.js b/modules/instagram-comments.js new file mode 100644 index 0000000..bae0cdd --- /dev/null +++ b/modules/instagram-comments.js @@ -0,0 +1,141 @@ +zeeschuimer.register_module( + 'Instagram (comments)', + 'instagram.com', + function (response, source_platform_url, source_url) { + let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); + + if (!["instagram.com"].includes(domain)) { + return []; + } + + const 
lower_source_url = source_url.toLowerCase(); + const looks_like_comments_request = lower_source_url.indexOf('/comments') >= 0 + || lower_source_url.indexOf('comments') >= 0 + || lower_source_url.indexOf('comment') >= 0; + + if (!looks_like_comments_request) { + return []; + } + + let data; + try { + if (response.startsWith("for (;;);")) { + response = response.slice("for (;;);".length); + } + data = JSON.parse(response); + } catch (SyntaxError) { + return []; + } + + const media_id_match = source_url.match(/\/media\/([^\/?]+)\/comments/i); + const shortcode_match = source_platform_url.match(/\/(p|reel|reels)\/([^\/?#]+)/i); + const post_id_from_url = media_id_match ? media_id_match[1] : null; + const post_type = shortcode_match ? shortcode_match[1].replace('reels', 'reel') : 'p'; + const post_shortcode = shortcode_match ? shortcode_match[2] : null; + const post_url = post_shortcode ? 'https://www.instagram.com/' + post_type + '/' + post_shortcode + '/' : source_platform_url; + + let comments = []; + let seen = new Set(); + + const normalise_user = function (user) { + if (!user) { + return null; + } + + const user_id = user['pk'] || user['pk_id'] || user['id']; + + return { + id: user_id ? 
String(user_id) : undefined, + unique_id: user['username'], + nickname: user['full_name'], + avatar_thumb: user['profile_pic_url'], + verified: !!user['is_verified'], + is_private: !!user['is_private'] + }; + }; + + const add_comment = function (comment, parent_comment_id=null) { + if (!comment || typeof comment !== "object") { + return; + } + + const comment_id = comment['pk'] || comment['id']; + const text = comment['text']; + const user = normalise_user(comment['user'] || comment['owner']); + const post_id = comment['media_id'] || comment['media_pk'] || post_id_from_url; + + if (!comment_id || !text || !user || !post_id) { + return; + } + + if (seen.has(String(comment_id))) { + return; + } + seen.add(String(comment_id)); + + comment['id'] = String(comment_id); + comment['comment_id'] = String(comment_id); + comment['text'] = text; + comment['user'] = user; + comment['post_id'] = String(post_id); + comment['post_shortcode'] = post_shortcode; + comment['post_url'] = post_url; + comment['parent_comment_id'] = parent_comment_id ? String(parent_comment_id) : null; + comment['thread_id'] = String(post_id); + comment['_zs_comment_parent_id'] = parent_comment_id ? 
String(parent_comment_id) : String(post_id); + comment['_zs_comment_thread_id'] = String(post_id); + comment['_zs_comment_post_id'] = String(post_id); + + comments.push(comment); + }; + + const traverse = function (obj, parent_comment_id=null) { + if (!obj || typeof obj !== "object") { + return; + } + + if (Array.isArray(obj)) { + for (const item of obj) { + traverse(item, parent_comment_id); + } + return; + } + + if ((obj['pk'] || obj['id']) && obj['text'] && (obj['user'] || obj['owner'])) { + add_comment(obj, parent_comment_id); + + const comment_id = obj['pk'] || obj['id']; + for (const replies_key of ['child_comments', 'preview_child_comments', 'inline_child_comments']) { + if (Array.isArray(obj[replies_key])) { + for (const reply of obj[replies_key]) { + traverse(reply, comment_id); + } + } + } + return; + } + + for (let property in obj) { + if (!obj.hasOwnProperty(property) || !obj[property]) { + continue; + } + + if (property === 'comments' && Array.isArray(obj[property])) { + for (const comment of obj[property]) { + traverse(comment, parent_comment_id); + } + } else if (property === 'edges' && Array.isArray(obj[property])) { + for (const edge of obj[property]) { + traverse(edge && edge['node'] ? edge['node'] : edge, parent_comment_id); + } + } else if (typeof obj[property] === "object") { + traverse(obj[property], parent_comment_id); + } + } + }; + + traverse(data); + return comments; + }, + 'instagram-comments' +); diff --git a/modules/twitter-comments.js b/modules/twitter-comments.js new file mode 100644 index 0000000..e3915ad --- /dev/null +++ b/modules/twitter-comments.js @@ -0,0 +1,264 @@ +zeeschuimer.register_module( + 'X/Twitter (comments)', + 'x.com', + function (response, source_platform_url, source_url) { + let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); + const root_tweet_id_match = source_platform_url.match(/\/status\/(\d+)/); + const root_tweet_id = root_tweet_id_match ? 
root_tweet_id_match[1] : null; + const looks_like_tweet_request = source_url.indexOf('TweetDetail') >= 0 + || source_url.indexOf('tweetdetail') >= 0 + || source_url.indexOf('/graphql/') >= 0 + || source_url.indexOf('/i/api/') >= 0; + + if (!["x.com"].includes(domain)) { + return []; + } + + // X frequently changes the exact GraphQL operation name used to load + // replies. When the user is on a /status/... page, be permissive about + // which request may contain the thread payload. + if (!root_tweet_id && !looks_like_tweet_request) { + return []; + } + + let data; + try { + data = JSON.parse(response); + } catch (SyntaxError) { + return []; + } + + let comments = []; + let seen = new Set(); + + const normalise_tweet = function (tweet) { + if (!tweet || tweet['__typename'] === 'TweetUnavailable') { + return null; + } + + if ('tweet' in tweet) { + tweet = tweet['tweet']; + } + + if (!tweet['legacy']) { + return null; + } + + const tweet_id = tweet['legacy']['id_str'] || tweet['rest_id']; + if (!tweet_id) { + return null; + } + + tweet['id'] = tweet_id; + return tweet; + }; + + const normalise_user = function (tweet) { + const user = tweet['core'] && tweet['core']['user_results'] && tweet['core']['user_results']['result']; + if (user) { + const core = user['core'] || {}; + const legacy = user['legacy'] || {}; + const avatar = user['avatar'] || {}; + + return { + id: user['rest_id'] || user['id'], + unique_id: core['screen_name'], + nickname: core['name'], + signature: legacy['description'], + avatar_thumb: avatar['image_url'] || legacy['profile_image_url_https'], + verified: !!(user['verification'] && user['verification']['verified']), + verified_type: user['verification'] ? 
user['verification']['verified_type'] : undefined, + follower_count: legacy['followers_count'], + following_count: legacy['friends_count'] + }; + } + + const fallback_user = tweet['user'] || null; + if (!fallback_user) { + return null; + } + + return { + id: fallback_user['id_str'] || fallback_user['id'], + unique_id: fallback_user['screen_name'], + nickname: fallback_user['name'], + signature: fallback_user['description'], + avatar_thumb: fallback_user['profile_image_url_https'], + verified: !!fallback_user['verified'], + follower_count: fallback_user['followers_count'], + following_count: fallback_user['friends_count'] + }; + }; + + const is_comment = function (tweet) { + const legacy = tweet['legacy']; + const tweet_id = String(tweet['id']); + const conversation_id = legacy['conversation_id_str'] ? String(legacy['conversation_id_str']) : null; + const reply_to_id = legacy['in_reply_to_status_id_str'] ? String(legacy['in_reply_to_status_id_str']) : null; + + if (root_tweet_id) { + return tweet_id !== root_tweet_id + && (conversation_id === root_tweet_id || reply_to_id === root_tweet_id); + } + + return !!reply_to_id; + }; + + const add_tweet = function (tweet) { + tweet = normalise_tweet(tweet); + if (!tweet || !is_comment(tweet)) { + return; + } + + const tweet_id = String(tweet['id']); + if (seen.has(tweet_id)) { + return; + } + seen.add(tweet_id); + + const parent_id = tweet['legacy']['in_reply_to_status_id_str'] || null; + const post_id = root_tweet_id || tweet['legacy']['conversation_id_str'] || parent_id; + const author = normalise_user(tweet); + + // Keep the original Twitter/X payload, but expose TikTok-like fields + // for downstream analysis pipelines that need text, author and post id. + tweet['comment_id'] = tweet_id; + tweet['text'] = tweet['legacy']['full_text'] || tweet['legacy']['text'] || ''; + tweet['user'] = author; + tweet['post_id'] = post_id; + tweet['post_url'] = post_id ? 
'https://x.com/i/web/status/' + post_id : null; + tweet['parent_comment_id'] = parent_id && parent_id !== post_id ? parent_id : null; + tweet['thread_id'] = tweet['legacy']['conversation_id_str'] || post_id; + tweet['_zs_comment_parent_id'] = parent_id || post_id; + tweet['_zs_comment_thread_id'] = post_id; + tweet['_zs_comment_post_id'] = post_id; + comments.push(tweet); + }; + + const add_from_item_content = function (item_content) { + if (!item_content || !item_content['tweet_results']) { + return; + } + + add_tweet(item_content['tweet_results']['result']); + }; + + const collect_tweet_candidates = function (obj) { + if (!obj || typeof obj !== "object") { + return; + } + + if (Array.isArray(obj)) { + for (const item of obj) { + collect_tweet_candidates(item); + } + return; + } + + // Direct tweet-like object + if ( + obj['legacy'] + && typeof obj['legacy'] === 'object' + && (obj['rest_id'] || obj['id'] || obj['legacy']['id_str']) + ) { + add_tweet(obj); + } + + // Common GraphQL wrapper + if (obj['tweet_results'] && obj['tweet_results']['result']) { + add_tweet(obj['tweet_results']['result']); + } + + // Some responses wrap the tweet one level deeper + if (obj['result'] && typeof obj['result'] === 'object') { + const result = obj['result']; + if ( + result['legacy'] + || result['tweet'] + || result['tweet_results'] + || result['__typename'] === 'Tweet' + || result['__typename'] === 'TweetWithVisibilityResults' + ) { + collect_tweet_candidates(result); + } + } + + if (obj['tweet'] && typeof obj['tweet'] === 'object') { + collect_tweet_candidates(obj['tweet']); + } + + for (let property in obj) { + if (!obj.hasOwnProperty(property) || !obj[property]) { + continue; + } + + if (typeof obj[property] === "object") { + collect_tweet_candidates(obj[property]); + } + } + }; + + const traverse = function (obj) { + for (let property in obj) { + let child = obj[property]; + if (!child) { + continue; + } + + if ( + ( + (child.hasOwnProperty('type') && child['type'] === 
'TimelineAddEntries') + || (!child.hasOwnProperty('type') && Object.keys(child).length === 1) + ) + && child.hasOwnProperty('entries') + ) { + for (let entry in child['entries']) { + entry = child['entries'][entry]; + if (!entry['content']) { + continue; + } + + if ('itemContent' in entry['content']) { + add_from_item_content(entry['content']['itemContent']); + } else if (entry['content']['content'] && entry['content']['content']['itemContent']) { + add_from_item_content(entry['content']['content']['itemContent']); + } else if ('__typename' in entry['content'] && entry['content']['__typename'] === 'TimelineTimelineModule') { + for (const item of entry['content']['items']) { + const item_content = item['item'] && item['item']['itemContent']; + if (!item_content || !['Tweet', 'TimelineTweet'].includes(item_content['__typename'])) { + continue; + } + + add_from_item_content(item_content); + } + } else if (entry['entryId'] && data['globalObjects'] && data['globalObjects']['tweets']) { + let tweet_id = null; + if (entry['entryId'].indexOf('tweet-') === 0) { + tweet_id = entry['entryId'].split('-')[1]; + } else if (entry['entryId'].indexOf('sq-I-t-') === 0) { + tweet_id = entry['entryId'].split('-')[3]; + } + + if (tweet_id && data['globalObjects']['tweets'][tweet_id]) { + add_tweet({ + id: tweet_id, + legacy: data['globalObjects']['tweets'][tweet_id], + user: data['globalObjects']['users'] + ? 
data['globalObjects']['users'][data['globalObjects']['tweets'][tweet_id]['user_id_str']] + : null + }); + } + } + } + } else if (typeof (child) === "object") { + traverse(child); + } + } + }; + + traverse(data); + collect_tweet_candidates(data); + return comments; + }, + 'twitter-comments' +); diff --git a/modules/twitter.js b/modules/twitter.js index 0a36282..66e1301 100644 --- a/modules/twitter.js +++ b/modules/twitter.js @@ -3,6 +3,8 @@ zeeschuimer.register_module( 'x.com', function (response, source_platform_url, source_url) { let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); + const root_tweet_id_match = source_platform_url.match(/\/status\/(\d+)/); + const root_tweet_id = root_tweet_id_match ? root_tweet_id_match[1] : null; if ( !["x.com"].includes(domain) @@ -37,6 +39,51 @@ zeeschuimer.register_module( // One of the 'instructions' is to add entries to the timeline, this is what we are interested in because what // is added to the timeline are the tweets! // So find those instructions in the object, and reconstruct the tweets from there + const normalise_tweet = function (tweet) { + if (!tweet || tweet['__typename'] === 'TweetUnavailable') { + return null; + } + + if ('tweet' in tweet) { + tweet = tweet['tweet']; + } + + if (!tweet['legacy']) { + return null; + } + + tweet['id'] = tweet['legacy']['id_str'] || tweet['rest_id']; + return tweet['id'] ? tweet : null; + }; + + const should_include_as_post = function (tweet) { + tweet = normalise_tweet(tweet); + if (!tweet) { + return false; + } + + if (!root_tweet_id) { + return true; + } + + const tweet_id = String(tweet['id']); + const legacy = tweet['legacy'] || {}; + const conversation_id = legacy['conversation_id_str'] ? String(legacy['conversation_id_str']) : null; + const reply_to_id = legacy['in_reply_to_status_id_str'] ? 
String(legacy['in_reply_to_status_id_str']) : null; + + // On a single-tweet thread page, keep the root post in the posts + // stream and leave replies to the dedicated comments module. + if (tweet_id === root_tweet_id) { + return true; + } + + if (conversation_id === root_tweet_id || reply_to_id === root_tweet_id || !!reply_to_id) { + return false; + } + + return true; + }; + let traverse = function (obj) { for (let property in obj) { let child = obj[property]; @@ -69,11 +116,11 @@ zeeschuimer.register_module( continue; } - if('tweet' in tweet) { - // sometimes this is nested once more, for some reason - tweet = tweet['tweet']; + tweet = normalise_tweet(tweet); + if (!tweet || !should_include_as_post(tweet)) { + continue; } - tweet['id'] = tweet['legacy']['id_str']; + // distinguish tweets that were included because they were "promoted" from // those that are actually part of the user/home timeline or search result. // assume a tweet was promoted if itemContent has promotedMetadata @@ -87,7 +134,12 @@ zeeschuimer.register_module( }).map(item => { return item['item']['itemContent']['tweet_results']['result'] })) { - tweets.push({...reply_tweet, id: parseInt(reply_tweet['rest_id'])}); + const tweet = normalise_tweet(reply_tweet); + if (!tweet || !should_include_as_post(tweet)) { + continue; + } + + tweets.push(tweet); } } else { // in other cases this object only contains a reference to the full tweet, which is in turn @@ -116,7 +168,9 @@ zeeschuimer.register_module( // the user is also stored as a reference - so add the user data to the tweet tweet['user'] = data['globalObjects']['users'][tweet['legacy']['user_id_str']] - tweets.push(tweet); + if (should_include_as_post(tweet)) { + tweets.push(tweet); + } } } @@ -130,4 +184,4 @@ zeeschuimer.register_module( return tweets; }, 'twitter.com' -); \ No newline at end of file +); diff --git a/popup/interface.html b/popup/interface.html index 356f2b5..ce5e8cd 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ 
-1,6 +1,6 @@ - Zeeschuimer + Pesquisa Social @@ -381,8 +381,8 @@
-

Zeeschuimer

- v1.13.6 +

Pesquisa Social

+ v1.13.9

Captured data objects

@@ -457,15 +457,12 @@

Import from NDJSON

- \ No newline at end of file + diff --git a/popup/interface.js b/popup/interface.js index 2dc0a13..5e917b4 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -4,6 +4,7 @@ var xhr; var is_uploading = false; const downloadUrls = new Map(); const duplicateBehaviorKey = 'zs-duplicate-behavior'; +const mediaDownloadPlatforms = ['tiktok.com', 'instagram.com', 'twitter.com']; /** * StreamSaver init @@ -132,7 +133,7 @@ function activate_buttons() { button.setAttribute('title', ''); } - } else if(button.classList.contains('download-ndjson') || button.classList.contains('reset')) { + } else if(button.classList.contains('download-ndjson') || button.classList.contains('download-media-zip') || button.classList.contains('reset')) { new_status = !(items > 0); } @@ -171,7 +172,7 @@ function update_icon() { } /** - * Get Zeeschuimer stats + * Get extension stats * * Loads the amount of items collected, etc. This function is called * periodically to keep the numbers in the interface updated as items are @@ -238,6 +239,10 @@ async function get_stats() { "data-platform": platform, "class": "download-ndjson" }, ".ndjson"); + let media_button = createElement("button", { + "data-platform": platform, + "class": "download-media-zip" + }, ".zip", "photo-film"); let fourcat_button = createElement("button", { "data-platform": platform, "class": "upload-to-4cat", @@ -245,6 +250,9 @@ async function get_stats() { actions.appendChild(clear_button); actions.appendChild(download_button); + if (mediaDownloadPlatforms.includes(platform)) { + actions.appendChild(media_button); + } actions.appendChild(fourcat_button); row.appendChild(actions); @@ -338,6 +346,31 @@ async function button_handler(event) { event.target.classList.remove('loading'); + } else if (event.target.matches('.download-media-zip')) { + let platform = event.target.getAttribute('data-platform'); + let date = new Date(); + event.target.classList.add('loading'); + status.innerText = 'Creating media ZIP...'; + + try { + let blob 
= await get_media_zip_blob(platform, function (current, total) { + status.innerText = 'Downloading media: ' + current + '/' + total; + }); + let filename = 'zeeschuimer-media-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.zip'; + const downloadUrl = window.URL.createObjectURL(blob); + const downloadId = await browser.downloads.download({ + url: downloadUrl, + filename: filename, + conflictAction: 'uniquify' + }); + downloadUrls.set(downloadId, downloadUrl); + status.innerText = ''; + } catch (e) { + status.innerText = 'Could not create media ZIP: ' + e.message; + } + + event.target.classList.remove('loading'); + } else if (event.target.matches('.upload-to-4cat')) { let platform = event.target.getAttribute('data-platform'); status.innerText = 'Creating data file for uploading...'; @@ -540,7 +573,7 @@ const upload_poll = { }, /** - * Add dataset to Zeeschuimer history + * Add dataset to upload history * * @param progress * @returns {Promise} @@ -574,6 +607,519 @@ async function get_blob(platform) { return new Blob(ndjson, {type: 'application/x-ndjson'}); } +/** + * Get a ZIP with all downloadable media for a platform. + * + * The ZIP contains a manifest.json file mapping each downloaded file back to + * the stored item and platform post it was extracted from. 
/**
 * Download every media file referenced by the captured items of a platform
 * and bundle the results into a ZIP archive
 *
 * The archive always starts with a `manifest.json` that lists one entry per
 * media URL, including the ones that could not be downloaded (their
 * `filename` is null and `error` describes what went wrong). A failed
 * download never aborts the export.
 *
 * @param platform  Platform identifier, handed to `iterate_items` unchanged
 * @param progress_callback  Called as `(current, total)` before each download
 * @returns {Promise}  Resolves to the ZIP archive as a Blob
 * @throws {Error}  If none of the items contain a media URL
 */
async function get_media_zip_blob(platform, progress_callback=function(){}) {
    let media_items = [];

    // NOTE(review): `iterate_items` is defined elsewhere in this file; it is
    // assumed to call the callback once per captured item - confirm.
    await iterate_items(platform, function(item) {
        extract_media_urls(item).forEach((media, index) => {
            media_items.push(Object.assign({}, media, {
                item: item,
                index: index + 1
            }));
        });
    });

    if (media_items.length === 0) {
        throw new Error('no media URLs found for this platform');
    }

    let zip_entries = [];
    let manifest = [];
    let seen_filenames = new Set();

    for (let index = 0; index < media_items.length; index++) {
        const media = media_items[index];
        progress_callback(index + 1, media_items.length);

        const download = await fetch_media_response(media);
        if (!download.response) {
            manifest.push(media_manifest_entry(media, null, download.error));
            continue;
        }

        // from here on the manifest should list the URL that actually answered
        media.url = download.url;

        let data;
        let blob_type = '';
        try {
            const blob = await download.response.blob();
            blob_type = blob.type;
            data = new Uint8Array(await blob.arrayBuffer());
        } catch (e) {
            // the connection can still drop while the body is being read;
            // record that for this file instead of failing the whole export
            manifest.push(media_manifest_entry(media, null, 'download failed: ' + e.message));
            continue;
        }

        const content_type = download.response.headers.get('content-type') || blob_type || media.content_type || '';
        const filename = uniquify_filename(media_filename(media, content_type), seen_filenames);
        seen_filenames.add(filename);

        zip_entries.push({
            filename: filename,
            data: data
        });
        manifest.push(media_manifest_entry(media, filename, null, content_type));
    }

    zip_entries.unshift({
        filename: 'manifest.json',
        data: new TextEncoder().encode(JSON.stringify(manifest, null, 2))
    });

    return create_zip_blob(zip_entries);
}

/**
 * Try the candidate URLs of a media object until one returns usable data
 *
 * Candidates are `media.alternate_urls` when that is a non-empty array,
 * otherwise just `media.url`. A candidate is rejected when the request
 * throws, when the HTTP status is not OK, or when a video URL answers with
 * an `image/*` content type.
 *
 * @param media  Media object as produced by the `extract_*_media_urls` helpers
 * @returns {Promise}  Resolves to `{response, url, error}`; `response` is
 * null and `error` holds the last failure reason if no candidate worked
 */
async function fetch_media_response(media) {
    const candidate_urls = Array.isArray(media.alternate_urls) && media.alternate_urls.length > 0
        ? media.alternate_urls
        : [media.url];
    let last_error = null;

    for (const candidate_url of candidate_urls) {
        let response;
        try {
            response = await fetch(candidate_url, {credentials: 'include'});
        } catch (e) {
            last_error = 'fetch failed: ' + e.message;
            continue;
        }

        if (!response.ok) {
            last_error = 'HTTP ' + response.status;
            continue;
        }

        const content_type = response.headers.get('content-type') || '';
        if (media.type === 'video' && content_type.toLowerCase().indexOf('image/') === 0) {
            last_error = 'unexpected image response for video URL';
            continue;
        }

        return {response: response, url: candidate_url, error: null};
    }

    return {response: null, url: media.url, error: last_error};
}

/**
 * Extract media URLs from platform item data.
 *
 * Dispatches on `item.source_platform`; platforms without an extractor, and
 * items without `data`, yield an empty list.
 *
 * @param item  Captured item with `source_platform`, `item_id` and `data`
 * @returns {Array}  Media objects (`url`, `type`, `content_type`, `post_id`
 * and, for TikTok, `alternate_urls`)
 */
function extract_media_urls(item) {
    if (!item || !item.data) {
        return [];
    }

    switch (item.source_platform) {
        case 'tiktok.com':
            return extract_tiktok_media_urls(item);
        case 'instagram.com':
            return extract_instagram_media_urls(item);
        case 'twitter.com':
            return extract_twitter_media_urls(item);
        default:
            return [];
    }
}

/**
 * Extract media URLs from a TikTok item
 *
 * For videos the sources are tried in this order, stopping at the first that
 * yields a URL: highest-bitrate entry of `video.bitrateInfo`, then
 * `video.playAddr`, then `video.downloadAddr`. Images of `imagePost` are
 * only used when no video URL was found.
 *
 * @param item  Captured item; `item.data` must be set
 * @returns {Array}  De-duplicated media objects
 */
function extract_tiktok_media_urls(item) {
    const data = item.data;
    const post_id = data.id || item.item_id;
    let urls = [];

    // accepts a single URL or a list of mirror URLs for the same file
    const add_url_list = function (url_list, type, content_type) {
        if (typeof url_list === 'string') {
            url_list = [url_list];
        }
        if (!Array.isArray(url_list)) {
            return;
        }

        // drop unusable entries up front, so that a bad first mirror does not
        // make the whole entry disappear when the list is de-duplicated
        const usable = url_list.filter(url => typeof url === 'string' && url.length > 0);
        if (usable.length === 0) {
            return;
        }

        urls.push({
            url: usable[0],
            alternate_urls: usable,
            type: type,
            content_type: content_type,
            post_id: post_id
        });
    };

    if (data.video) {
        if (Array.isArray(data.video.bitrateInfo)) {
            const best = data.video.bitrateInfo
                .filter(info => info && info.PlayAddr && Array.isArray(info.PlayAddr.UrlList) && info.PlayAddr.UrlList.length > 0)
                .sort((a, b) => (b.Bitrate || 0) - (a.Bitrate || 0))[0];
            if (best) {
                add_url_list(preferred_tiktok_video_urls(best.PlayAddr.UrlList), 'video', 'video/mp4');
            }
        }
        if (urls.length === 0) {
            // `playAddr` is either an object with a `urlList` or the URL itself
            const play_addr = data.video.playAddr && (data.video.playAddr.urlList || data.video.playAddr);
            add_url_list(preferred_tiktok_video_urls(play_addr), 'video', 'video/mp4');
        }
        if (urls.length === 0) {
            const download_addr = data.video.downloadAddr && (data.video.downloadAddr.urlList || data.video.downloadAddr);
            add_url_list(preferred_tiktok_video_urls(download_addr), 'video', 'video/mp4');
        }
    }

    if (urls.length === 0 && data.imagePost && Array.isArray(data.imagePost.images)) {
        data.imagePost.images.forEach(image => {
            add_url_list(image && image.imageURL && image.imageURL.urlList, 'image', 'image/jpeg');
        });
    }

    return dedupe_media_urls(urls);
}

/**
 * Move `www.tiktok.com/aweme/v1/play` URLs to the front of a URL list
 *
 * The relative order within both groups is kept. Anything that is not an
 * array (a single URL string, undefined, ...) is returned unchanged.
 *
 * @param urls  Array of URLs, or any other value
 * @returns  The reordered array, or the input itself
 */
function preferred_tiktok_video_urls(urls) {
    if (!Array.isArray(urls)) {
        return urls;
    }

    const is_play_url = url => typeof url === 'string' && url.indexOf('www.tiktok.com/aweme/v1/play') >= 0;
    const play_urls = urls.filter(is_play_url);
    if (play_urls.length === 0) {
        return urls;
    }

    return play_urls.concat(urls.filter(url => !is_play_url(url)));
}

/**
 * Extract media URLs from an Instagram item
 *
 * Looks at the item itself and at every entry of `carousel_media`. For each
 * of those, the widest `video_versions` entry and the widest
 * `image_versions2.candidates` entry are collected (so a video contributes
 * both a video and an image URL).
 *
 * @param item  Captured item; `item.data` must be set
 * @returns {Array}  De-duplicated media objects
 */
function extract_instagram_media_urls(item) {
    const data = item.data;
    let urls = [];

    const widest = function (versions) {
        return versions
            .filter(version => version && version.url)
            .sort((a, b) => (b.width || 0) - (a.width || 0))[0];
    };

    const add_instagram_media = function (media, fallback_post_id) {
        if (!media) {
            return;
        }
        const post_id = media.id || fallback_post_id || data.id || item.item_id;

        if (Array.isArray(media.video_versions) && media.video_versions.length > 0) {
            const best_video = widest(media.video_versions);
            if (best_video) {
                urls.push({
                    url: best_video.url,
                    type: 'video',
                    content_type: 'video/mp4',
                    post_id: post_id
                });
            }
        }

        const candidates = media.image_versions2 && Array.isArray(media.image_versions2.candidates)
            ? media.image_versions2.candidates
            : [];
        const best_image = widest(candidates);
        if (best_image) {
            urls.push({
                url: best_image.url,
                type: 'image',
                content_type: 'image/jpeg',
                post_id: post_id
            });
        }
    };

    add_instagram_media(data);
    if (Array.isArray(data.carousel_media)) {
        data.carousel_media.forEach(media => add_instagram_media(media, data.id || item.item_id));
    }

    return dedupe_media_urls(urls);
}

/**
 * Extract media URLs from an X/Twitter item
 *
 * Reads `legacy.extended_entities.media` (or `legacy.entities.media`). For
 * media with video variants the highest-bitrate variant whose content type is
 * absent or `video/*` is used; otherwise the image URL with `name=orig`
 * appended as a query parameter.
 *
 * @param item  Captured item; `item.data` must be set
 * @returns {Array}  De-duplicated media objects
 */
function extract_twitter_media_urls(item) {
    const data = item.data;
    const legacy = data.legacy || {};
    const entities = legacy.extended_entities || legacy.entities || {};

    if (!Array.isArray(entities.media)) {
        return [];
    }

    const post_id = data.id || data.rest_id || legacy.id_str || item.item_id;
    let urls = [];

    entities.media.forEach(media => {
        if (!media) {
            return;
        }

        if (media.video_info && Array.isArray(media.video_info.variants)) {
            const best_video = media.video_info.variants
                .filter(variant => variant && variant.url && (!variant.content_type || variant.content_type.indexOf('video/') === 0))
                .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0];
            if (best_video) {
                urls.push({
                    url: best_video.url,
                    type: 'video',
                    content_type: best_video.content_type || 'video/mp4',
                    post_id: post_id
                });
                return;
            }
        }

        const image_url = media.media_url_https || media.media_url;
        if (image_url) {
            urls.push({
                url: image_url + (image_url.indexOf('?') >= 0 ? '&' : '?') + 'name=orig',
                type: 'image',
                content_type: 'image/jpeg',
                post_id: post_id
            });
        }
    });

    return dedupe_media_urls(urls);
}

/**
 * Remove media objects without a URL or with a URL that was already seen
 *
 * @param media_urls  Array of media objects
 * @returns {Array}  New array; the first occurrence of each URL is kept
 */
function dedupe_media_urls(media_urls) {
    let seen = new Set();
    return media_urls.filter(media => {
        if (!media || !media.url || seen.has(media.url)) {
            return false;
        }
        seen.add(media.url);
        return true;
    });
}

/**
 * Build the `manifest.json` record for one media object
 *
 * @param media  Media object with its `item` attached
 * @param filename  Name of the file inside the archive, or null on failure
 * @param error  Failure reason, or null on success
 * @param content_type  Content type of the downloaded data; falls back to
 * the content type expected for the media object
 * @returns {Object}  JSON-serialisable manifest entry
 */
function media_manifest_entry(media, filename, error=null, content_type=null) {
    const data = media.item.data || {};
    const raw_post_id = media.post_id || data.id || data.rest_id || media.item.item_id;
    // avoid the literal string "undefined" ending up as ID and in the post URL
    const post_id = raw_post_id ? String(raw_post_id) : null;

    return {
        filename: filename,
        error: error,
        source_platform: media.item.source_platform,
        item_id: media.item.item_id,
        post_id: post_id,
        post_url: media_post_url(media.item, post_id),
        captured_page_url: media.item.source_platform_url,
        post_author: media_post_author(media.item),
        media_type: media.type,
        media_url: media.url,
        content_type: content_type || media.content_type || null
    };
}

/**
 * Determine the public URL of the post a media file belongs to
 *
 * Falls back to `data.post_url` and finally to the URL of the page the item
 * was captured on when no platform-specific URL can be built.
 *
 * @param item  Captured item
 * @param post_id  ID of the post, may be empty
 * @returns  URL of the post; undefined if the item has none of the fallbacks
 */
function media_post_url(item, post_id) {
    const data = item.data || {};

    if (item.source_platform === 'tiktok.com') {
        const author = data.author || {};
        const username = author.uniqueId || author.unique_id || author.nickname;
        if (username && post_id) {
            return 'https://www.tiktok.com/@' + username + '/video/' + post_id;
        }
    }

    if (item.source_platform === 'instagram.com') {
        if (data.code) {
            const is_reel = data.product_type === 'clips' || data.media_type === 2;
            return 'https://www.instagram.com/' + (is_reel ? 'reel' : 'p') + '/' + data.code + '/';
        }
        if (data.post_url) {
            return data.post_url;
        }
    }

    if (item.source_platform === 'twitter.com') {
        if (post_id) {
            return 'https://x.com/i/web/status/' + post_id;
        }
        if (data.post_url) {
            return data.post_url;
        }
    }

    return data.post_url || item.source_platform_url;
}

/**
 * Extract basic author information from an item
 *
 * @param item  Captured item
 * @returns {Object|null}  `{id, unique_id, nickname}`, each a string or
 * null, or null if the item has no author data for its platform
 */
function media_post_author(item) {
    const data = item.data || {};

    if (item.source_platform === 'tiktok.com' && data.author) {
        const author_id = data.author.id || data.author.uid || data.author.secUid || null;
        return {
            id: author_id ? String(author_id) : null,
            unique_id: data.author.uniqueId || data.author.unique_id || null,
            nickname: data.author.nickname || null
        };
    }

    if (item.source_platform === 'instagram.com' && data.user) {
        const author_id = data.user.pk || data.user.pk_id || data.user.id || null;
        return {
            id: author_id ? String(author_id) : null,
            unique_id: data.user.username || null,
            nickname: data.user.full_name || null
        };
    }

    if (item.source_platform === 'twitter.com') {
        const user = data.core && data.core.user_results && data.core.user_results.result;
        if (user) {
            const author_id = user.rest_id || user.id || null;
            // NOTE(review): the `legacy` fallback assumes older captures keep
            // the handle and name there instead of in `core` - confirm
            // against stored data.
            const core = user.core || {};
            const legacy = user.legacy || {};
            return {
                id: author_id ? String(author_id) : null,
                unique_id: core.screen_name || legacy.screen_name || null,
                nickname: core.name || legacy.name || null
            };
        }
    }

    return null;
}

/**
 * Build the path of a media file inside the archive
 *
 * The result has the shape `<post>/<post>-<NN>-<type>.<extension>`. The
 * extension is taken from the content type, else from the URL, else guessed
 * from the media type.
 *
 * @param media  Media object with `index`, `type`, `url` and `item`
 * @param content_type  Content type of the downloaded data
 * @returns {string}  Relative path using `/` as separator
 */
function media_filename(media, content_type) {
    const extension = extension_from_content_type(content_type) || extension_from_url(media.url) || (media.type === 'video' ? 'mp4' : 'jpg');
    const post_id = safe_filename(media.post_id || media.item.item_id || 'post');
    const media_type = safe_filename(media.type || 'media');
    return post_id + '/' + post_id + '-' + String(media.index).padStart(2, '0') + '-' + media_type + '.' + extension;
}

/**
 * Map a content type to a file extension
 *
 * Parameters after `;` are ignored and matching is case-insensitive.
 *
 * @param content_type  Value of a Content-Type header
 * @returns {string}  Extension without the dot, or '' if unknown
 */
function extension_from_content_type(content_type) {
    if (!content_type || typeof content_type !== 'string') {
        return '';
    }

    const extensions = {
        'image/jpeg': 'jpg',
        'image/jpg': 'jpg',
        'image/png': 'png',
        'image/webp': 'webp',
        'image/gif': 'gif',
        'video/mp4': 'mp4',
        'video/quicktime': 'mov'
    };
    const mime_type = content_type.split(';')[0].trim().toLowerCase();

    // own properties only: the header value comes from the server, and e.g.
    // 'constructor' must not resolve to something inherited from Object
    return Object.prototype.hasOwnProperty.call(extensions, mime_type) ? extensions[mime_type] : '';
}

/**
 * Take the file extension from the path of a URL
 *
 * @param url  Absolute URL
 * @returns {string}  Lower-cased extension of 2-5 alphanumeric characters,
 * or '' if there is none or the URL cannot be parsed
 */
function extension_from_url(url) {
    try {
        const pathname = new URL(url).pathname;
        const match = pathname.match(/\.([a-zA-Z0-9]{2,5})$/);
        return match ? match[1].toLowerCase() : '';
    } catch (e) {
        return '';
    }
}

/**
 * Reduce a value to characters that are safe in a file name
 *
 * Runs of characters other than letters, digits, `.`, `_` and `-` become a
 * single underscore; leading and trailing underscores are removed.
 *
 * @param value  Any value; it is converted to a string first
 * @returns {string}  Sanitised name, or 'item' if nothing usable remains
 */
function safe_filename(value) {
    const cleaned = String(value).replace(/[^a-zA-Z0-9._-]+/g, '_').replace(/^_+|_+$/g, '');

    // '.' and '..' would otherwise pass; as the result is used as a folder
    // name that would let an archive entry point outside its own folder
    if (cleaned === '' || /^\.+$/.test(cleaned)) {
        return 'item';
    }
    return cleaned;
}

/**
 * Make a file name unique by appending `-2`, `-3`, ... before the extension
 *
 * @param filename  Desired name, may contain `/`-separated folders
 * @param seen_filenames  Set of names already in use; not modified
 * @returns {string}  `filename` if unused, else the first free variant
 */
function uniquify_filename(filename, seen_filenames) {
    if (!seen_filenames.has(filename)) {
        return filename;
    }

    // a dot only marks an extension if it is in the last path segment
    const dot = filename.lastIndexOf('.');
    const has_extension = dot >= 0 && dot > filename.lastIndexOf('/');
    const base = has_extension ? filename.slice(0, dot) : filename;
    const extension = has_extension ? filename.slice(dot) : '';

    let counter = 2;
    while (seen_filenames.has(base + '-' + counter + extension)) {
        counter += 1;
    }
    return base + '-' + counter + extension;
}

/**
 * Create an uncompressed ZIP file.
 *
 * All entries use the "stored" method. File names are written as UTF-8 and
 * flagged as such. ZIP64 is not implemented, so archives that would need it
 * are refused rather than written with wrapped-around header fields.
 *
 * @param entries  Array of {filename, data}; `data` must be a Uint8Array
 * @returns {Blob}  Blob of type `application/zip`
 * @throws {Error}  If a count, size or offset does not fit a classic ZIP
 */
function create_zip_blob(entries) {
    // 0xFFFF and 0xFFFFFFFF are reserved as "see ZIP64 record" markers
    const max_uint16 = 0xFFFF;
    const max_uint32 = 0xFFFFFFFF;
    // general purpose bit 11: file name is UTF-8
    const utf8_flag = 0x0800;
    // 1980-01-01, the earliest date the format can express (0 is invalid)
    const dos_date = 0x0021;

    if (entries.length >= max_uint16) {
        throw new Error('too many files for a ZIP archive: ' + entries.length);
    }

    const encoder = new TextEncoder();
    let chunks = [];
    let central_directory = [];
    let offset = 0;

    entries.forEach(entry => {
        const filename = encoder.encode(entry.filename);
        const data = entry.data;

        if (filename.length > max_uint16) {
            throw new Error('file name too long for a ZIP archive: ' + entry.filename);
        }
        if (data.length >= max_uint32 || offset >= max_uint32) {
            throw new Error('media too large for a ZIP archive: ' + entry.filename);
        }

        const crc = crc32(data);

        // version needed, flags, method, time, date, crc, sizes, name/extra length
        const local_header = zip_header(0x04034b50, [
            20, utf8_flag, 0, 0, dos_date, crc, data.length, data.length, filename.length, 0
        ], [2, 2, 2, 2, 2, 4, 4, 4, 2, 2]);
        chunks.push(local_header, filename, data);

        // as above, preceded by "version made by" and followed by comment
        // length, disk number, attributes and the offset of the local header
        const central_header = zip_header(0x02014b50, [
            20, 20, utf8_flag, 0, 0, dos_date, crc, data.length, data.length, filename.length, 0, 0, 0, 0, 0, offset
        ], [2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 4, 4]);
        central_directory.push(central_header, filename);

        offset += local_header.length + filename.length + data.length;
    });

    const central_start = offset;
    const central_size = central_directory.reduce((sum, chunk) => sum + chunk.length, 0);
    if (central_start >= max_uint32 || central_size >= max_uint32) {
        throw new Error('media too large for a ZIP archive');
    }

    const end_header = zip_header(0x06054b50, [
        0, 0, entries.length, entries.length, central_size, central_start, 0
    ], [2, 2, 2, 2, 4, 4, 2]);

    return new Blob([...chunks, ...central_directory, end_header], {type: 'application/zip'});
}

/**
 * Serialise a ZIP record: a 4-byte signature followed by little-endian fields
 *
 * @param signature  Record signature
 * @param values  Field values, in order
 * @param sizes  Width of each field in bytes; 2 is written as 16 bit,
 * anything else as 32 bit
 * @returns {Uint8Array}  The encoded record
 */
function zip_header(signature, values, sizes) {
    const length = 4 + sizes.reduce((sum, size) => sum + size, 0);
    const view = new DataView(new ArrayBuffer(length));

    view.setUint32(0, signature, true);
    let offset = 4;
    values.forEach((value, index) => {
        if (sizes[index] === 2) {
            view.setUint16(offset, value, true);
        } else {
            view.setUint32(offset, value >>> 0, true);
        }
        offset += sizes[index];
    });

    return new Uint8Array(view.buffer);
}

/**
 * Calculate the CRC-32 checksum used by ZIP
 *
 * The lookup table is built on first use and cached on the function.
 *
 * @param data  Uint8Array (or array of byte values)
 * @returns {number}  Checksum as an unsigned 32-bit integer
 */
function crc32(data) {
    if (!crc32.table) {
        const table = new Uint32Array(256);
        for (let n = 0; n < 256; n++) {
            let c = n;
            for (let k = 0; k < 8; k++) {
                c = (c & 1) ? (0xEDB88320 ^ (c >>> 1)) : (c >>> 1);
            }
            table[n] = c >>> 0;
        }
        crc32.table = table;
    }

    let crc = -1;
    for (let i = 0; i < data.length; i++) {
        crc = (crc >>> 8) ^ crc32.table[(crc ^ data[i]) & 0xFF];
    }
    return (crc ^ (-1)) >>> 0;
}