/**
 * Instagram comment capture module.
 *
 * Walks any JSON response whose request URL mentions "comment" and collects
 * every object that looks like a comment (has an id, a text and an author),
 * including nested replies. Each captured comment keeps its original payload
 * and gets TikTok-like convenience fields plus the `_zs_comment_*` linkage
 * fields.
 */
zeeschuimer.register_module(
    'Instagram (comments)',
    'instagram.com',
    function (response, source_platform_url, source_url) {
        const has_own = function (obj, key) {
            // captured JSON is untrusted and may contain its own
            // 'hasOwnProperty' key, so never call the method on the object
            return Object.prototype.hasOwnProperty.call(obj, key);
        };

        const domain = (source_platform_url.split("/")[2] || '').toLowerCase().replace(/^www\./, '');

        if (!["instagram.com"].includes(domain)) {
            return [];
        }

        // 'comment' is a substring of both 'comments' and '/comments', so a
        // single check covers every URL the module is interested in
        if (!source_url.toLowerCase().includes('comment')) {
            return [];
        }

        let data;
        try {
            // some endpoints prefix their JSON with an anti-hijacking guard
            if (response.startsWith("for (;;);")) {
                response = response.slice("for (;;);".length);
            }
            data = JSON.parse(response);
        } catch (e) {
            // not JSON (or not a string at all): nothing to capture
            return [];
        }

        const media_id_match = source_url.match(/\/media\/([^\/?]+)\/comments/i);
        const shortcode_match = source_platform_url.match(/\/(p|reel|reels)\/([^\/?#]+)/i);
        const post_id_from_url = media_id_match ? media_id_match[1] : null;
        // the match is case-insensitive, so normalise case before mapping
        // 'reels' to the canonical 'reel' path
        const post_type = shortcode_match ? shortcode_match[1].toLowerCase().replace('reels', 'reel') : 'p';
        const post_shortcode = shortcode_match ? shortcode_match[2] : null;
        const post_url = post_shortcode
            ? 'https://www.instagram.com/' + post_type + '/' + post_shortcode + '/'
            : source_platform_url;

        let comments = [];
        let seen = new Set();

        /**
         * Reduce an Instagram user object to the fields shared with other
         * comment modules. Returns null when there is no user.
         */
        const normalise_user = function (user) {
            if (!user) {
                return null;
            }

            const user_id = user['pk'] || user['pk_id'] || user['id'];

            return {
                id: user_id ? String(user_id) : undefined,
                unique_id: user['username'],
                nickname: user['full_name'],
                avatar_thumb: user['profile_pic_url'],
                verified: !!user['is_verified'],
                is_private: !!user['is_private']
            };
        };

        /**
         * Annotate one comment object in place and queue it for capture.
         * Comments without id, text, author or a resolvable post id are
         * ignored, as are comments already seen in this response.
         */
        const add_comment = function (comment, parent_comment_id=null) {
            if (!comment || typeof comment !== "object") {
                return;
            }

            const comment_id = comment['pk'] || comment['id'];
            const text = comment['text'];
            const user = normalise_user(comment['user'] || comment['owner']);
            const post_id = comment['media_id'] || comment['media_pk'] || post_id_from_url;

            if (!comment_id || !text || !user || !post_id) {
                return;
            }

            const comment_key = String(comment_id);
            if (seen.has(comment_key)) {
                return;
            }
            seen.add(comment_key);

            const post_key = String(post_id);
            const parent_key = parent_comment_id ? String(parent_comment_id) : null;

            comment['id'] = comment_key;
            comment['comment_id'] = comment_key;
            comment['text'] = text;
            comment['user'] = user;
            comment['post_id'] = post_key;
            comment['post_shortcode'] = post_shortcode;
            comment['post_url'] = post_url;
            comment['parent_comment_id'] = parent_key;
            comment['thread_id'] = post_key;
            // top-level comments are parented to the post itself
            comment['_zs_comment_parent_id'] = parent_key || post_key;
            comment['_zs_comment_thread_id'] = post_key;
            comment['_zs_comment_post_id'] = post_key;

            comments.push(comment);
        };

        /**
         * Recursively search the response for comment-shaped objects.
         */
        const traverse = function (obj, parent_comment_id=null) {
            if (!obj || typeof obj !== "object") {
                return;
            }

            if (Array.isArray(obj)) {
                for (const item of obj) {
                    traverse(item, parent_comment_id);
                }
                return;
            }

            // NOTE(review): any object with an id, a text and a user matches
            // here - presumably that includes a post caption object if the
            // response embeds one; confirm whether captions should be skipped.
            if ((obj['pk'] || obj['id']) && obj['text'] && (obj['user'] || obj['owner'])) {
                // read the id before add_comment() rewrites fields in place
                const comment_id = obj['pk'] || obj['id'];
                add_comment(obj, parent_comment_id);

                for (const replies_key of ['child_comments', 'preview_child_comments', 'inline_child_comments']) {
                    if (Array.isArray(obj[replies_key])) {
                        for (const reply of obj[replies_key]) {
                            traverse(reply, comment_id);
                        }
                    }
                }
                return;
            }

            for (const property in obj) {
                if (!has_own(obj, property) || !obj[property]) {
                    continue;
                }

                const value = obj[property];
                if (property === 'comments' && Array.isArray(value)) {
                    for (const comment of value) {
                        traverse(comment, parent_comment_id);
                    }
                } else if (property === 'edges' && Array.isArray(value)) {
                    // GraphQL-style lists wrap each item in {node: ...}
                    for (const edge of value) {
                        traverse(edge && edge['node'] ? edge['node'] : edge, parent_comment_id);
                    }
                } else if (typeof value === "object") {
                    traverse(value, parent_comment_id);
                }
            }
        };

        traverse(data);
        return comments;
    },
    'instagram-comments'
);

/**
 * X/Twitter comment capture module.
 *
 * Reads `TweetDetail` responses and captures every tweet that is a reply in
 * the conversation of the tweet currently being viewed (or, when the page URL
 * has no status id, every tweet that is a reply to something).
 */
zeeschuimer.register_module(
    'X/Twitter (comments)',
    'x.com',
    function (response, source_platform_url, source_url) {
        const has_own = function (obj, key) {
            // captured JSON is untrusted and may contain its own
            // 'hasOwnProperty' key, so never call the method on the object
            return Object.prototype.hasOwnProperty.call(obj, key);
        };

        const domain = (source_platform_url.split("/")[2] || '').toLowerCase().replace(/^www\./, '');

        if (!["x.com"].includes(domain) || source_url.indexOf('TweetDetail') < 0) {
            return [];
        }

        let data;
        try {
            data = JSON.parse(response);
        } catch (e) {
            return [];
        }

        const root_tweet_id_match = source_platform_url.match(/\/status\/(\d+)/);
        const root_tweet_id = root_tweet_id_match ? root_tweet_id_match[1] : null;
        let comments = [];

        /**
         * Unwrap a tweet result and give it an `id`. Returns null for
         * unavailable tweets and for results without a `legacy` payload.
         */
        const normalise_tweet = function (tweet) {
            if (!tweet || typeof tweet !== 'object' || tweet['__typename'] === 'TweetUnavailable') {
                return null;
            }

            // some results wrap the actual tweet in a 'tweet' property
            if ('tweet' in tweet) {
                tweet = tweet['tweet'];
            }

            if (!tweet || !tweet['legacy']) {
                return null;
            }

            const tweet_id = tweet['legacy']['id_str'] || tweet['rest_id'];
            if (!tweet_id) {
                return null;
            }

            tweet['id'] = tweet_id;
            return tweet;
        };

        /**
         * Reduce the tweet's author to the fields shared with other comment
         * modules. Returns null when the tweet carries no user result.
         */
        const normalise_user = function (tweet) {
            const user = tweet['core'] && tweet['core']['user_results'] && tweet['core']['user_results']['result'];
            if (!user) {
                return null;
            }

            const core = user['core'] || {};
            const legacy = user['legacy'] || {};
            const avatar = user['avatar'] || {};
            const verification = user['verification'];

            return {
                id: user['rest_id'] || user['id'],
                unique_id: core['screen_name'],
                nickname: core['name'],
                signature: legacy['description'],
                avatar_thumb: avatar['image_url'] || legacy['profile_image_url_https'],
                verified: !!(verification && verification['verified']),
                verified_type: verification ? verification['verified_type'] : undefined,
                follower_count: legacy['followers_count'],
                following_count: legacy['friends_count']
            };
        };

        /**
         * Decide whether a normalised tweet is a comment on the viewed tweet.
         */
        const is_comment = function (tweet) {
            const legacy = tweet['legacy'];
            const tweet_id = String(tweet['id']);
            const conversation_id = legacy['conversation_id_str'] ? String(legacy['conversation_id_str']) : null;
            const reply_to_id = legacy['in_reply_to_status_id_str'] ? String(legacy['in_reply_to_status_id_str']) : null;

            if (root_tweet_id) {
                return tweet_id !== root_tweet_id
                    && (conversation_id === root_tweet_id || reply_to_id === root_tweet_id);
            }

            return !!reply_to_id;
        };

        const add_tweet = function (tweet) {
            tweet = normalise_tweet(tweet);
            if (!tweet || !is_comment(tweet)) {
                return;
            }

            const legacy = tweet['legacy'];
            const tweet_id = String(tweet['id']);
            const parent_id = legacy['in_reply_to_status_id_str'] || null;
            const post_id = root_tweet_id || legacy['conversation_id_str'] || parent_id;

            // Keep the original Twitter/X payload, but expose TikTok-like fields
            // for downstream analysis pipelines that need text, author and post id.
            tweet['comment_id'] = tweet_id;
            tweet['text'] = legacy['full_text'] || legacy['text'] || '';
            tweet['user'] = normalise_user(tweet);
            tweet['post_id'] = post_id;
            tweet['post_url'] = post_id ? 'https://x.com/i/web/status/' + post_id : null;
            // direct replies to the post have no parent *comment*
            tweet['parent_comment_id'] = parent_id && parent_id !== post_id ? parent_id : null;
            tweet['thread_id'] = legacy['conversation_id_str'] || post_id;
            tweet['_zs_comment_parent_id'] = parent_id || post_id;
            tweet['_zs_comment_thread_id'] = post_id;
            tweet['_zs_comment_post_id'] = post_id;
            comments.push(tweet);
        };

        /**
         * Handle the entries of one timeline instruction.
         */
        const add_entries = function (entries) {
            if (!entries || typeof entries !== 'object') {
                return;
            }

            for (const entry of Object.values(entries)) {
                const content = entry && entry['content'];
                if (!content || typeof content !== 'object') {
                    continue;
                }

                if ('itemContent' in content) {
                    const item_content = content['itemContent'];
                    if (!item_content || !item_content['tweet_results']) {
                        continue;
                    }

                    add_tweet(item_content['tweet_results']['result']);
                } else if (content['__typename'] === 'TimelineTimelineModule' && Array.isArray(content['items'])) {
                    for (const item of content['items']) {
                        const item_content = item && item['item'] && item['item']['itemContent'];
                        if (
                            !item_content
                            || !['Tweet', 'TimelineTweet'].includes(item_content['__typename'])
                            || !item_content['tweet_results']
                        ) {
                            continue;
                        }

                        add_tweet(item_content['tweet_results']['result']);
                    }
                }
            }
        };

        /**
         * Recursively look for timeline instructions that carry entries.
         */
        const traverse = function (obj) {
            for (const property in obj) {
                const child = obj[property];
                if (!child) {
                    continue;
                }

                if (
                    (
                        (has_own(child, 'type') && child['type'] === 'TimelineAddEntries')
                        || (!has_own(child, 'type') && Object.keys(child).length === 1)
                    )
                    && has_own(child, 'entries')
                ) {
                    add_entries(child['entries']);
                } else if (typeof (child) === "object") {
                    traverse(child);
                }
            }
        };

        traverse(data);
        return comments;
    },
    'twitter-comments'
);
if(button.classList.contains('download-ndjson') || button.classList.contains('reset')) { + } else if(button.classList.contains('download-ndjson') || button.classList.contains('download-media-zip') || button.classList.contains('reset')) { new_status = !(items > 0); } @@ -238,6 +239,10 @@ async function get_stats() { "data-platform": platform, "class": "download-ndjson" }, ".ndjson"); + let media_button = createElement("button", { + "data-platform": platform, + "class": "download-media-zip" + }, ".zip", "photo-film"); let fourcat_button = createElement("button", { "data-platform": platform, "class": "upload-to-4cat", @@ -245,6 +250,9 @@ async function get_stats() { actions.appendChild(clear_button); actions.appendChild(download_button); + if (mediaDownloadPlatforms.includes(platform)) { + actions.appendChild(media_button); + } actions.appendChild(fourcat_button); row.appendChild(actions); @@ -338,6 +346,31 @@ async function button_handler(event) { event.target.classList.remove('loading'); + } else if (event.target.matches('.download-media-zip')) { + let platform = event.target.getAttribute('data-platform'); + let date = new Date(); + event.target.classList.add('loading'); + status.innerText = 'Creating media ZIP...'; + + try { + let blob = await get_media_zip_blob(platform, function (current, total) { + status.innerText = 'Downloading media: ' + current + '/' + total; + }); + let filename = 'zeeschuimer-media-' + platform + '-' + date.toISOString().split(".")[0].replace(/:/g, "") + '.zip'; + const downloadUrl = window.URL.createObjectURL(blob); + const downloadId = await browser.downloads.download({ + url: downloadUrl, + filename: filename, + conflictAction: 'uniquify' + }); + downloadUrls.set(downloadId, downloadUrl); + status.innerText = ''; + } catch (e) { + status.innerText = 'Could not create media ZIP: ' + e.message; + } + + event.target.classList.remove('loading'); + } else if (event.target.matches('.upload-to-4cat')) { let platform = 
/**
 * Get a ZIP with all downloadable media for a platform.
 *
 * The ZIP contains a manifest.json file mapping each downloaded file back to
 * the Zeeschuimer item and platform post it was extracted from. Media that
 * could not be downloaded is still listed in the manifest, with `filename`
 * set to null and `error` describing the last failure.
 *
 * @param platform  Platform identifier, passed to iterate_items()
 * @param progress_callback  Called as (current, total) before each download
 * @returns {Promise<Blob>}
 * @throws {Error} When no item of the platform has any media URL
 */
async function get_media_zip_blob(platform, progress_callback=function(){}) {
    let media_items = [];

    await iterate_items(platform, function(item) {
        extract_media_urls(item).forEach((media, index) => {
            media_items.push(Object.assign({}, media, {
                item: item,
                index: index + 1
            }));
        });
    });

    if (media_items.length === 0) {
        throw new Error('no media URLs found for this platform');
    }

    let zip_entries = [];
    let manifest = [];
    let seen_filenames = new Set();

    for (let index = 0; index < media_items.length; index++) {
        const media = media_items[index];
        progress_callback(index + 1, media_items.length);

        const candidate_urls = media.alternate_urls && media.alternate_urls.length > 0
            ? media.alternate_urls
            : [media.url];
        let downloaded = null;
        let last_error = null;

        for (const candidate_url of candidate_urls) {
            try {
                const response = await fetch(candidate_url, {credentials: 'include'});
                if (!response.ok) {
                    last_error = 'HTTP ' + response.status;
                    continue;
                }

                const content_type = response.headers.get('content-type') || '';
                // a video URL answering with an image is a placeholder, not the media
                if (media.type === 'video' && content_type.toLowerCase().indexOf('image/') === 0) {
                    last_error = 'unexpected image response for video URL';
                    continue;
                }

                // reading the body happens inside the try block as well: a
                // connection dropped mid-download must only fail this one
                // candidate, not abort the whole export
                downloaded = {
                    url: candidate_url,
                    content_type: content_type || media.content_type || '',
                    data: new Uint8Array(await response.arrayBuffer())
                };
                break;
            } catch (e) {
                last_error = 'fetch failed: ' + e.message;
            }
        }

        if (!downloaded) {
            manifest.push(media_manifest_entry(media, null, last_error));
            continue;
        }

        // record the URL that actually delivered the file
        media.url = downloaded.url;
        const filename = uniquify_filename(media_filename(media, downloaded.content_type), seen_filenames);
        seen_filenames.add(filename);

        zip_entries.push({
            filename: filename,
            data: downloaded.data
        });
        manifest.push(media_manifest_entry(media, filename, null, downloaded.content_type));
    }

    // the manifest goes first so it is easy to find in the archive
    zip_entries.unshift({
        filename: 'manifest.json',
        data: new TextEncoder().encode(JSON.stringify(manifest, null, 2))
    });

    return create_zip_blob(zip_entries);
}
/**
 * Extract media URLs from platform item data.
 *
 * Dispatches on `item.source_platform`; platforms without media support and
 * items without data yield an empty list.
 *
 * @param item  Captured item ({source_platform, item_id, data, ...})
 * @returns {Array}  Media descriptors: {url, type, content_type, post_id}
 *                   and, for TikTok, alternate_urls
 */
function extract_media_urls(item) {
    if (!item || !item.data) {
        return [];
    }

    switch (item.source_platform) {
        case 'tiktok.com':
            return extract_tiktok_media_urls(item);
        case 'instagram.com':
            return extract_instagram_media_urls(item);
        case 'twitter.com':
            return extract_twitter_media_urls(item);
        default:
            return [];
    }
}

/**
 * Extract the video (or, failing that, the slideshow images) of a TikTok item.
 *
 * Video sources are tried in order: highest-bitrate entry of `bitrateInfo`,
 * then `playAddr`, then `downloadAddr`.
 *
 * @param item
 * @returns {Array}
 */
function extract_tiktok_media_urls(item) {
    const data = item.data;
    const post_id = data.id || item.item_id;
    let urls = [];

    const add_url_list = function (url_list, type, content_type) {
        if (typeof url_list === 'string') {
            url_list = [url_list];
        }
        if (!Array.isArray(url_list)) {
            return;
        }

        // drop empty and non-string entries so they are never fetched
        const usable_urls = url_list.filter(url => typeof url === 'string' && url.length > 0);
        if (usable_urls.length === 0) {
            return;
        }

        urls.push({
            url: usable_urls[0],
            alternate_urls: usable_urls,
            type: type,
            content_type: content_type,
            post_id: post_id
        });
    };

    const add_best_bitrate_video = function () {
        if (!Array.isArray(data.video.bitrateInfo)) {
            return;
        }

        const bitrate = data.video.bitrateInfo
            .filter(info => info && info.PlayAddr && Array.isArray(info.PlayAddr.UrlList) && info.PlayAddr.UrlList.length > 0)
            .sort((a, b) => (b.Bitrate || 0) - (a.Bitrate || 0))[0];
        if (bitrate) {
            add_url_list(preferred_tiktok_video_urls(bitrate.PlayAddr.UrlList), 'video', 'video/mp4');
        }
    };

    if (data.video) {
        add_best_bitrate_video();
        if (urls.length === 0) {
            const play_addr = data.video.playAddr && (data.video.playAddr.urlList || data.video.playAddr);
            add_url_list(preferred_tiktok_video_urls(play_addr), 'video', 'video/mp4');
        }
        if (urls.length === 0) {
            const download_addr = data.video.downloadAddr && (data.video.downloadAddr.urlList || data.video.downloadAddr);
            add_url_list(preferred_tiktok_video_urls(download_addr), 'video', 'video/mp4');
        }
    }

    if (urls.length === 0 && data.imagePost && Array.isArray(data.imagePost.images)) {
        data.imagePost.images.forEach(image => {
            add_url_list(image && image.imageURL && image.imageURL.urlList, 'image', 'image/jpeg');
        });
    }

    return dedupe_media_urls(urls);
}

/**
 * Reorder a TikTok URL list so www.tiktok.com play URLs come first.
 *
 * Anything that is not an array (a single URL string, undefined, ...) is
 * returned unchanged.
 *
 * @param urls
 * @returns {*}
 */
function preferred_tiktok_video_urls(urls) {
    if (!Array.isArray(urls)) {
        return urls;
    }

    const is_play_url = url => typeof url === 'string' && url.indexOf('www.tiktok.com/aweme/v1/play') >= 0;
    const play_urls = urls.filter(is_play_url);
    if (play_urls.length === 0) {
        return urls;
    }

    return play_urls.concat(urls.filter(url => !play_urls.includes(url)));
}

/**
 * Extract the widest video and the widest image of an Instagram item and of
 * each of its carousel children.
 *
 * @param item
 * @returns {Array}
 */
function extract_instagram_media_urls(item) {
    const data = item.data;
    let urls = [];

    const widest = function (versions) {
        if (!Array.isArray(versions)) {
            return undefined;
        }
        return versions
            .filter(version => version && version.url)
            .sort((a, b) => (b.width || 0) - (a.width || 0))[0];
    };

    const add_instagram_media = function (media, fallback_post_id) {
        if (!media || typeof media !== 'object') {
            return;
        }

        const post_id = media.id || fallback_post_id || data.id || item.item_id;

        const best_video = widest(media.video_versions);
        if (best_video) {
            urls.push({
                url: best_video.url,
                type: 'video',
                content_type: 'video/mp4',
                post_id: post_id
            });
        }

        const best_image = widest(media.image_versions2 && media.image_versions2.candidates);
        if (best_image) {
            urls.push({
                url: best_image.url,
                type: 'image',
                content_type: 'image/jpeg',
                post_id: post_id
            });
        }
    };

    add_instagram_media(data);
    if (Array.isArray(data.carousel_media)) {
        data.carousel_media.forEach(media => add_instagram_media(media, data.id || item.item_id));
    }

    return dedupe_media_urls(urls);
}

/**
 * Extract media of a tweet: the highest-bitrate video variant where there is
 * one, otherwise the original-size image.
 *
 * @param item
 * @returns {Array}
 */
function extract_twitter_media_urls(item) {
    const data = item.data;
    const legacy = data.legacy || {};
    const entities = legacy.extended_entities || legacy.entities || {};
    let urls = [];

    if (!Array.isArray(entities.media)) {
        return [];
    }

    const post_id = data.id || data.rest_id || legacy.id_str || item.item_id;

    entities.media.forEach(media => {
        if (!media) {
            return;
        }

        if (media.video_info && Array.isArray(media.video_info.variants)) {
            // variants without a video content type (e.g. playlists) are skipped
            const best_video = media.video_info.variants
                .filter(variant => variant && variant.url && (!variant.content_type || variant.content_type.indexOf('video/') === 0))
                .sort((a, b) => (b.bitrate || 0) - (a.bitrate || 0))[0];
            if (best_video) {
                urls.push({
                    url: best_video.url,
                    type: 'video',
                    content_type: best_video.content_type || 'video/mp4',
                    post_id: post_id
                });
                return;
            }
        }

        const image_url = media.media_url_https || media.media_url;
        if (image_url) {
            urls.push({
                url: image_url + (image_url.indexOf('?') >= 0 ? '&' : '?') + 'name=orig',
                type: 'image',
                content_type: 'image/jpeg',
                post_id: post_id
            });
        }
    });

    return dedupe_media_urls(urls);
}

/**
 * Drop media descriptors without URL and repeated URLs, keeping first
 * occurrences in order.
 *
 * @param media_urls
 * @returns {Array}
 */
function dedupe_media_urls(media_urls) {
    let seen = new Set();
    return media_urls.filter(media => {
        if (!media.url || seen.has(media.url)) {
            return false;
        }
        seen.add(media.url);
        return true;
    });
}

/**
 * Build the manifest.json record for one media file.
 *
 * @param media  Media descriptor with its `item` attached
 * @param filename  Path inside the ZIP, or null when the download failed
 * @param error  Failure description, or null on success
 * @param content_type  Content type reported by the server, if any
 * @returns {Object}
 */
function media_manifest_entry(media, filename, error=null, content_type=null) {
    const data = media.item.data || {};
    const raw_post_id = media.post_id || data.id || data.rest_id || media.item.item_id;
    // a missing id must stay null: String(undefined) would put the text
    // "undefined" in the manifest and in the generated post URL
    const post_id = raw_post_id ? String(raw_post_id) : null;

    return {
        filename: filename,
        error: error,
        source_platform: media.item.source_platform,
        item_id: media.item.item_id,
        post_id: post_id,
        post_url: media_post_url(media.item, post_id),
        captured_page_url: media.item.source_platform_url,
        post_author: media_post_author(media.item),
        media_type: media.type,
        media_url: media.url,
        content_type: content_type || media.content_type || null
    };
}

/**
 * Best-effort public URL of the post an item represents.
 *
 * Falls back to `data.post_url` and finally to the page the item was
 * captured on.
 *
 * @param item
 * @param post_id
 * @returns {string}
 */
function media_post_url(item, post_id) {
    const data = item.data || {};

    if (item.source_platform === 'tiktok.com') {
        const author = data.author || {};
        const username = author.uniqueId || author.unique_id || author.nickname;
        if (username && post_id) {
            // nicknames are free text, so encode before putting one in a path
            return 'https://www.tiktok.com/@' + encodeURIComponent(username) + '/video/' + post_id;
        }
    }

    if (item.source_platform === 'instagram.com' && data.code) {
        const is_reel = data.product_type === 'clips' || data.media_type === 2;
        return 'https://www.instagram.com/' + (is_reel ? 'reel' : 'p') + '/' + data.code + '/';
    }

    if (item.source_platform === 'twitter.com' && post_id) {
        return 'https://x.com/i/web/status/' + post_id;
    }

    return data.post_url || item.source_platform_url;
}

/**
 * Author of the post an item represents, as {id, unique_id, nickname}, or
 * null when the item carries no author information.
 *
 * @param item
 * @returns {Object|null}
 */
function media_post_author(item) {
    const data = item.data || {};
    const as_id = value => (value ? String(value) : null);

    if (item.source_platform === 'tiktok.com' && data.author) {
        return {
            id: as_id(data.author.id || data.author.uid || data.author.secUid),
            unique_id: data.author.uniqueId || data.author.unique_id || null,
            nickname: data.author.nickname || null
        };
    }

    if (item.source_platform === 'instagram.com' && data.user) {
        return {
            id: as_id(data.user.pk || data.user.pk_id || data.user.id),
            unique_id: data.user.username || null,
            nickname: data.user.full_name || null
        };
    }

    if (item.source_platform === 'twitter.com') {
        const user = data.core && data.core.user_results && data.core.user_results.result;
        if (user) {
            return {
                id: as_id(user.rest_id || user.id),
                unique_id: user.core ? user.core.screen_name : null,
                nickname: user.core ? user.core.name : null
            };
        }
    }

    return null;
}

/**
 * Path of a media file inside the ZIP: `<post>/<post>-<NN>-<type>.<ext>`.
 *
 * The extension comes from the content type, then from the URL, then from
 * the media type.
 *
 * @param media
 * @param content_type
 * @returns {string}
 */
function media_filename(media, content_type) {
    const extension = extension_from_content_type(content_type)
        || extension_from_url(media.url)
        || (media.type === 'video' ? 'mp4' : 'jpg');
    const post_id = safe_filename(media.post_id || media.item.item_id || 'post');
    const media_type = safe_filename(media.type || 'media');
    return post_id + '/' + post_id + '-' + String(media.index).padStart(2, '0') + '-' + media_type + '.' + extension;
}

/**
 * File extension for a known content type; '' for unknown or missing types.
 * Parameters such as `; charset=...` and letter case are ignored.
 *
 * @param content_type
 * @returns {string}
 */
function extension_from_content_type(content_type) {
    if (!content_type) {
        return '';
    }

    const known_extensions = {
        'image/jpeg': 'jpg',
        'image/jpg': 'jpg',
        'image/png': 'png',
        'image/webp': 'webp',
        'image/gif': 'gif',
        'image/avif': 'avif',
        'video/mp4': 'mp4',
        'video/webm': 'webm',
        'video/quicktime': 'mov'
    };
    const mime_type = content_type.split(';')[0].trim().toLowerCase();
    // own-property check so names like 'constructor' never match
    return Object.prototype.hasOwnProperty.call(known_extensions, mime_type) ? known_extensions[mime_type] : '';
}

/**
 * Lower-cased 2-5 character extension of a URL's path; '' when there is
 * none or the URL cannot be parsed.
 *
 * @param url
 * @returns {string}
 */
function extension_from_url(url) {
    try {
        const match = new URL(url).pathname.match(/\.([a-zA-Z0-9]{2,5})$/);
        return match ? match[1].toLowerCase() : '';
    } catch (e) {
        return '';
    }
}

/**
 * Reduce a value to a string that is safe as one path segment in the ZIP.
 *
 * Runs of other characters become '_' and surrounding underscores are
 * trimmed. Empty results and results made only of dots ('.', '..') become
 * 'item', so a crafted id can never point outside its folder.
 *
 * @param value
 * @returns {string}
 */
function safe_filename(value) {
    const cleaned = String(value).replace(/[^a-zA-Z0-9._-]+/g, '_').replace(/^_+|_+$/g, '');
    if (cleaned === '' || /^\.+$/.test(cleaned)) {
        return 'item';
    }
    return cleaned;
}

/**
 * Return `filename`, or a `-2`, `-3`, ... variant of it that is not in
 * `seen_filenames`. The caller registers the returned name itself.
 *
 * @param filename
 * @param seen_filenames  Set of names already used
 * @returns {string}
 */
function uniquify_filename(filename, seen_filenames) {
    if (!seen_filenames.has(filename)) {
        return filename;
    }

    // only a dot in the last path segment starts an extension
    const dot = filename.lastIndexOf('.');
    const has_extension = dot > filename.lastIndexOf('/');
    const base = has_extension ? filename.slice(0, dot) : filename;
    const extension = has_extension ? filename.slice(dot) : '';

    let counter = 2;
    while (seen_filenames.has(base + '-' + counter + extension)) {
        counter += 1;
    }
    return base + '-' + counter + extension;
}
/**
 * Create an uncompressed ("stored") ZIP file.
 *
 * File names are written as UTF-8 and flagged as such. The archive uses the
 * classic ZIP format without ZIP64, so it is limited to 65535 files and
 * 4 GiB; exceeding a limit raises an error instead of producing a corrupt
 * archive.
 *
 * @param entries Array of {filename, data}, with data a Uint8Array
 * @param modified Date stored as modification time of every file
 * @returns {Blob}
 * @throws {Error} When the archive would exceed the classic ZIP limits
 */
function create_zip_blob(entries, modified=new Date()) {
    const max_uint16 = 0xFFFF;
    const max_uint32 = 0xFFFFFFFF;
    // general purpose bit 11: file names are encoded as UTF-8
    const utf8_flag = 0x0800;

    if (entries.length > max_uint16) {
        throw new Error('too many files for a ZIP archive (maximum is ' + max_uint16 + ')');
    }

    const encoder = new TextEncoder();
    const [dos_time, dos_date] = zip_dos_datetime(modified);
    let chunks = [];
    let central_directory = [];
    let central_size = 0;
    let offset = 0;

    entries.forEach(entry => {
        const filename = encoder.encode(entry.filename);
        const data = entry.data;
        if (filename.length > max_uint16 || data.length > max_uint32 || offset > max_uint32) {
            throw new Error('media is too large for a ZIP archive (maximum is 4 GiB)');
        }

        const crc = crc32(data);

        // local file header: version needed, flags, method (0 = stored),
        // time, date, crc, compressed size, size, name length, extra length
        const local_header = zip_header(0x04034b50, [
            20, utf8_flag, 0, dos_time, dos_date, crc, data.length, data.length, filename.length, 0
        ], [2, 2, 2, 2, 2, 4, 4, 4, 2, 2]);
        chunks.push(local_header, filename, data);

        // central directory header: version made by, then the local header
        // fields, comment length, disk number, internal and external
        // attributes, offset of the local header
        const central_header = zip_header(0x02014b50, [
            20, 20, utf8_flag, 0, dos_time, dos_date, crc, data.length, data.length, filename.length, 0, 0, 0, 0, 0, offset
        ], [2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 4, 4]);
        central_directory.push(central_header, filename);
        central_size += central_header.length + filename.length;

        offset += local_header.length + filename.length + data.length;
    });

    if (offset > max_uint32 || central_size > max_uint32) {
        throw new Error('media is too large for a ZIP archive (maximum is 4 GiB)');
    }

    // end of central directory: disk numbers, entry counts, directory size
    // and offset, comment length
    const end_header = zip_header(0x06054b50, [
        0, 0, entries.length, entries.length, central_size, offset, 0
    ], [2, 2, 2, 2, 4, 4, 2]);

    return new Blob([...chunks, ...central_directory, end_header], {type: 'application/zip'});
}

/**
 * Convert a Date to the MS-DOS [time, date] pair used in ZIP headers, based
 * on local time. Years outside 1980-2107 are clamped to that range.
 *
 * @param date
 * @returns {number[]}
 */
function zip_dos_datetime(date) {
    const year = Math.min(Math.max(date.getFullYear(), 1980), 2107);
    // seconds are stored with two-second resolution
    const dos_time = (date.getHours() << 11) | (date.getMinutes() << 5) | (date.getSeconds() >> 1);
    const dos_date = ((year - 1980) << 9) | ((date.getMonth() + 1) << 5) | date.getDate();
    return [dos_time, dos_date];
}

/**
 * Serialise one ZIP record: a 4-byte signature followed by little-endian
 * fields.
 *
 * @param signature  Record signature
 * @param values  Field values, in order
 * @param sizes  Field widths in bytes (2 or 4), parallel to `values`
 * @returns {Uint8Array}
 */
function zip_header(signature, values, sizes) {
    const length = 4 + sizes.reduce((sum, size) => sum + size, 0);
    const view = new DataView(new ArrayBuffer(length));
    let offset = 0;

    view.setUint32(offset, signature, true);
    offset += 4;

    values.forEach((value, index) => {
        if (sizes[index] === 2) {
            view.setUint16(offset, value, true);
        } else {
            view.setUint32(offset, value >>> 0, true);
        }
        offset += sizes[index];
    });

    return new Uint8Array(view.buffer);
}

/**
 * CRC-32 checksum (reflected polynomial 0xEDB88320, as used by ZIP) of a
 * byte array.
 *
 * @param data  Uint8Array or array of byte values
 * @returns {number}  Unsigned 32-bit checksum
 */
function crc32(data) {
    // the lookup table is built once and cached on the function itself
    if (!crc32.table) {
        const table = new Uint32Array(256);
        for (let n = 0; n < 256; n++) {
            let c = n;
            for (let k = 0; k < 8; k++) {
                c = (c & 1) ? (0xEDB88320 ^ (c >>> 1)) : (c >>> 1);
            }
            table[n] = c;
        }
        crc32.table = table;
    }

    let crc = -1;
    for (let i = 0; i < data.length; i++) {
        crc = (crc >>> 8) ^ crc32.table[(crc ^ data[i]) & 0xFF];
    }
    return (crc ^ -1) >>> 0;
}
# Build an .xpi package of the extension from the current directory.
#
# Everything except the entries matched by $exclude is copied to a temporary
# staging folder, zipped, and renamed to "<name>-v<version>.xpi", with name
# and version taken from manifest.json.
#
# NOTE(review): Compress-Archive on older Windows PowerShell/.NET versions is
# reported to write backslash path separators into the archive, which Firefox
# may reject - confirm that the produced XPI loads before distributing it.

# abort on the first failing command instead of packaging a partial build
$ErrorActionPreference = "Stop"

$manifest = Get-Content -LiteralPath "manifest.json" -Raw | ConvertFrom-Json
$name = ($manifest.name -replace '[^a-zA-Z0-9_-]+', '-').ToLower().Trim('-')
$version = $manifest.version
$output = "$name-v$version.xpi"
# Compress-Archive only accepts a .zip destination, so build under that name
$zipOutput = "$name-v$version.zip"
$staging = Join-Path $PWD ".build-xpi"

# wildcard patterns, matched against the names of top-level entries
$exclude = @(
    ".git",
    ".build-xpi",
    "tests",
    "*.zip",
    "*.xpi",
    "*.DS_Store",
    "__MACOSX",
    "create-zip.sh",
    "create-zip-bash.sh",
    "build-xpi.ps1"
)

if (Test-Path $staging) {
    Remove-Item -LiteralPath $staging -Recurse -Force
}

New-Item -ItemType Directory -Path $staging | Out-Null

try {
    Get-ChildItem -Force | Where-Object {
        $itemName = $_.Name
        # -like is an exact, case-insensitive comparison for patterns without
        # wildcards, so one operator covers both kinds of entry
        -not ($exclude | Where-Object { $itemName -like $_ })
    } | ForEach-Object {
        Copy-Item -LiteralPath $_.FullName -Destination $staging -Recurse -Force
    }

    if (Test-Path $output) {
        Remove-Item -LiteralPath $output -Force
    }

    if (Test-Path $zipOutput) {
        Remove-Item -LiteralPath $zipOutput -Force
    }

    Compress-Archive -Path (Join-Path $staging "*") -DestinationPath $zipOutput -Force
    Move-Item -LiteralPath $zipOutput -Destination $output -Force
} finally {
    # never leave the staging copy behind, also when packaging failed
    if (Test-Path $staging) {
        Remove-Item -LiteralPath $staging -Recurse -Force
    }
}

Write-Output "Created $output"
-

Zeeschuimer

- v1.13.6 +

Pesquisa Social

+ v1.13.6

Captured data objects

@@ -457,15 +457,12 @@

Import from NDJSON

- \ No newline at end of file + From 75e904044ad25baa9dfcf02281e40572b0d05e9d Mon Sep 17 00:00:00 2001 From: Danilo F Marinho Date: Mon, 4 May 2026 12:05:20 -0300 Subject: [PATCH 3/6] Add Mozilla source-package build flow --- .gitignore | 7 ++++ SOURCE-CODE-README.md | 56 ++++++++++++++++++++++++++++++++ build-source-zip.ps1 | 59 ++++++++++++++++++++++++++++++++++ build-xpi.ps1 | 75 +++++++++++++++++++++++++++---------------- 4 files changed, 170 insertions(+), 27 deletions(-) create mode 100644 SOURCE-CODE-README.md create mode 100644 build-source-zip.ps1 diff --git a/.gitignore b/.gitignore index 6cf9326..7796f16 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,10 @@ # logs geckodriver.log + +# Local build outputs +*.xpi +*.zip + +# Local captured exports +*.ndjson diff --git a/SOURCE-CODE-README.md b/SOURCE-CODE-README.md new file mode 100644 index 0000000..83c5918 --- /dev/null +++ b/SOURCE-CODE-README.md @@ -0,0 +1,56 @@ +# Source Code Package for Mozilla Review + +This archive contains the source code used to build the Firefox extension +"Pesquisa Social". + +## Extension identity + +- Add-on name: Pesquisa Social +- Add-on ID: pesquisa-social@local +- Fork contact: danilo-f.marinho@hotmail.com + +## Origin and license + +This project is a modified fork of Zeeschuimer. + +- Original project: https://github.com/digitalmethodsinitiative/zeeschuimer +- Original copyright: Stijn Peeters +- License for MPL-covered files: Mozilla Public License 2.0 + +See these files in the source package: + +- `LICENSE` +- `FORK-NOTICE.md` + +## Build environment + +- Operating system used to produce the submitted package: Windows +- Shell used: PowerShell +- Archive/build script: `build-xpi.ps1` +- Required software: PowerShell with .NET support for `System.IO.Compression` + +No Node.js bundling, transpilation, or webpack build is required for this +extension package. + +## How to reproduce the submitted XPI + +1. Extract this source code package. +2. 
Open PowerShell in the project root. +3. Run: + +```powershell +powershell -ExecutionPolicy Bypass -File .\build-xpi.ps1 +``` + +4. The script creates: + +```text +pesquisa-social-v1.13.6.xpi +``` + +## Notes for reviewers + +- The package includes third-party assets already present in the repository, + such as Font Awesome and font files. +- The submitted extension package excludes local data exports and other build + artifacts such as `.ndjson`, `.xpi`, and `.zip` files. diff --git a/build-source-zip.ps1 b/build-source-zip.ps1 new file mode 100644 index 0000000..009b2f1 --- /dev/null +++ b/build-source-zip.ps1 @@ -0,0 +1,59 @@ +Add-Type -AssemblyName System.IO.Compression +Add-Type -AssemblyName System.IO.Compression.FileSystem + +$output = "pesquisa-social-source-v1.13.6.zip" +$root = (Get-Location).Path + +$includePaths = @( + "manifest.json", + "popup", + "js", + "modules", + "images", + "fonts", + "inc", + "LICENSE", + "README.md", + "FORK-NOTICE.md", + "SOURCE-CODE-README.md", + "build-xpi.ps1" +) + +if (Test-Path $output) { + Remove-Item -LiteralPath $output -Force +} + +$zip = [System.IO.Compression.ZipFile]::Open($output, [System.IO.Compression.ZipArchiveMode]::Create) + +try { + foreach ($path in $includePaths) { + $fullPath = Join-Path $root $path + + if (Test-Path $fullPath -PathType Leaf) { + [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile( + $zip, + $fullPath, + ($path -replace '\\', '/'), + [System.IO.Compression.CompressionLevel]::Optimal + ) | Out-Null + continue + } + + if (Test-Path $fullPath -PathType Container) { + Get-ChildItem -LiteralPath $fullPath -Recurse -File | ForEach-Object { + $entryName = $_.FullName.Substring($root.Length).TrimStart('\') -replace '\\', '/' + [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile( + $zip, + $_.FullName, + $entryName, + [System.IO.Compression.CompressionLevel]::Optimal + ) | Out-Null + } + } + } +} +finally { + $zip.Dispose() +} + +Write-Output "Created $output" diff --git 
a/build-xpi.ps1 b/build-xpi.ps1 index e4f9114..82d3dba 100644 --- a/build-xpi.ps1 +++ b/build-xpi.ps1 @@ -2,51 +2,72 @@ $manifest = Get-Content "manifest.json" | ConvertFrom-Json $name = ($manifest.name -replace '[^a-zA-Z0-9_-]+', '-').ToLower().Trim('-') $version = $manifest.version $output = "$name-v$version.xpi" -$zipOutput = "$name-v$version.zip" -$staging = Join-Path $PWD ".build-xpi" -if (Test-Path $staging) { - Remove-Item -LiteralPath $staging -Recurse -Force -} +Add-Type -AssemblyName System.IO.Compression +Add-Type -AssemblyName System.IO.Compression.FileSystem -New-Item -ItemType Directory -Path $staging | Out-Null +$root = (Get-Location).Path -$exclude = @( +$excludeNames = @( ".git", + ".github", ".build-xpi", "tests", - "*.zip", - "*.xpi", - "*.DS_Store", - "__MACOSX", "create-zip.sh", "create-zip-bash.sh", "build-xpi.ps1" ) -Get-ChildItem -Force | Where-Object { - $item = $_ - -not ($exclude | Where-Object { - if ($_ -like "*`**" -or $_ -like "*?*") { - $item.Name -like $_ - } else { - $item.Name -eq $_ +$excludePatterns = @( + "*.zip", + "*.xpi", + "*.DS_Store", + "*.ndjson" +) + +function Should-Skip($item) { + foreach ($name in $excludeNames) { + if ($item.Name -eq $name) { + return $true + } + } + + foreach ($pattern in $excludePatterns) { + if ($item.Name -like $pattern) { + return $true } - }) -} | ForEach-Object { - Copy-Item -LiteralPath $_.FullName -Destination $staging -Recurse -Force + } + + return $false } if (Test-Path $output) { Remove-Item -LiteralPath $output -Force } -if (Test-Path $zipOutput) { - Remove-Item -LiteralPath $zipOutput -Force -} +$zip = [System.IO.Compression.ZipFile]::Open($output, [System.IO.Compression.ZipArchiveMode]::Create) + +try { + Get-ChildItem -LiteralPath $root -Recurse -File | ForEach-Object { + $file = $_ + + $segments = $file.FullName.Substring($root.Length).TrimStart('\').Split('\') + foreach ($segment in $segments) { + if ($excludeNames -contains $segment) { + return + } + } + + if (Should-Skip $file) { 
+ return + } -Compress-Archive -Path (Join-Path $staging "*") -DestinationPath $zipOutput -Force -Move-Item -LiteralPath $zipOutput -Destination $output -Force -Remove-Item -LiteralPath $staging -Recurse -Force + $entryName = ($file.FullName.Substring($root.Length).TrimStart('\')) -replace '\\', '/' + [System.IO.Compression.ZipFileExtensions]::CreateEntryFromFile($zip, $file.FullName, $entryName, [System.IO.Compression.CompressionLevel]::Optimal) | Out-Null + } +} +finally { + $zip.Dispose() +} Write-Output "Created $output" From d1a3cd0b38b53ce8252fdeffa9085d181b345b64 Mon Sep 17 00:00:00 2001 From: Danilo F Marinho Date: Mon, 11 May 2026 16:51:17 -0300 Subject: [PATCH 4/6] Focus extension on TikTok, Instagram, and X --- README.md | 10 +--- SOURCE-CODE-README.md | 2 +- build-source-zip.ps1 | 2 +- manifest.json | 14 +---- modules/twitter-comments.js | 107 ++++++++++++++++++++++++++---------- popup/interface.html | 2 +- popup/interface.js | 8 +-- 7 files changed, 88 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 040a1b9..8de4d1e 100644 --- a/README.md +++ b/README.md @@ -15,18 +15,10 @@ be exported as a JSON file or exported to a [4CAT](https://github.com/digitalmet analysis and storage. Zeeschuimer is primarily intended as a companion to 4CAT, but you can also integrate its output into your own analysis pipeline. 
-Currently, it supports the following platforms: +This fork currently supports the following platforms: * [TikTok](https://www.tiktok.com) (posts and comments) * [Instagram](https://www.instagram.com) (posts, reels, and comments) * [X/Twitter](https://www.x.com) (posts and comments) -* [LinkedIn](https://www.linkedin.com) -* [9gag](https://9gag.com) -* [Imgur](https://imgur.com) -* [Douyin](https://douyin.com) -* [Gab](https://gab.com) -* [Truth Social](https://truth.social) -* [Pinterest](https://pinterest.com) -* [RedNote/Xiaohongshu](https://xiaohongshu.com) Platform support requires regular maintenance to keep up with changes to the platforms. If something does not work, we welcome issues and pull requests. See 'Limitations' below for some known limitations to data capture. diff --git a/SOURCE-CODE-README.md b/SOURCE-CODE-README.md index 83c5918..61f8ac0 100644 --- a/SOURCE-CODE-README.md +++ b/SOURCE-CODE-README.md @@ -45,7 +45,7 @@ powershell -ExecutionPolicy Bypass -File .\build-xpi.ps1 4. 
The script creates: ```text -pesquisa-social-v1.13.6.xpi +pesquisa-social-v1.13.7.xpi ``` ## Notes for reviewers diff --git a/build-source-zip.ps1 b/build-source-zip.ps1 index 009b2f1..e0360ec 100644 --- a/build-source-zip.ps1 +++ b/build-source-zip.ps1 @@ -1,7 +1,7 @@ Add-Type -AssemblyName System.IO.Compression Add-Type -AssemblyName System.IO.Compression.FileSystem -$output = "pesquisa-social-source-v1.13.6.zip" +$output = "pesquisa-social-source-v1.13.7.zip" $root = (Get-Location).Path $includePaths = @( diff --git a/manifest.json b/manifest.json index 82e4a22..59a89f5 100644 --- a/manifest.json +++ b/manifest.json @@ -3,7 +3,7 @@ "description": "Capture social media data while browsing and export it for later analysis", "manifest_version": 2, "name": "Pesquisa Social", - "version": "1.13.6", + "version": "1.13.7", "homepage_url": "mailto:danilo-f.marinho@hotmail.com", "browser_specific_settings": { @@ -42,18 +42,8 @@ "modules/tiktok-comments.js", "modules/instagram.js", "modules/instagram-comments.js", - "modules/linkedin.js", - "modules/9gag.js", - "modules/imgur.js", "modules/twitter.js", - "modules/twitter-comments.js", - "modules/douyin.js", - "modules/gab.js", - "modules/truth.js", - "modules/threads.js", - "modules/pinterest.js", - "modules/rednote.js", - "modules/rednote-comments.js" + "modules/twitter-comments.js" ] } } diff --git a/modules/twitter-comments.js b/modules/twitter-comments.js index 58705fd..cc7c77f 100644 --- a/modules/twitter-comments.js +++ b/modules/twitter-comments.js @@ -3,8 +3,21 @@ zeeschuimer.register_module( 'x.com', function (response, source_platform_url, source_url) { let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); + const root_tweet_id_match = source_platform_url.match(/\/status\/(\d+)/); + const root_tweet_id = root_tweet_id_match ? 
root_tweet_id_match[1] : null; + const looks_like_tweet_request = source_url.indexOf('TweetDetail') >= 0 + || source_url.indexOf('tweetdetail') >= 0 + || source_url.indexOf('/graphql/') >= 0 + || source_url.indexOf('/i/api/') >= 0; + + if (!["x.com"].includes(domain)) { + return []; + } - if (!["x.com"].includes(domain) || source_url.indexOf('TweetDetail') < 0) { + // X frequently changes the exact GraphQL operation name used to load + // replies. When the user is on a /status/... page, be permissive about + // which request may contain the thread payload. + if (!root_tweet_id && !looks_like_tweet_request) { return []; } @@ -15,9 +28,8 @@ zeeschuimer.register_module( return []; } - const root_tweet_id_match = source_platform_url.match(/\/status\/(\d+)/); - const root_tweet_id = root_tweet_id_match ? root_tweet_id_match[1] : null; let comments = []; + let seen = new Set(); const normalise_tweet = function (tweet) { if (!tweet || tweet['__typename'] === 'TweetUnavailable') { @@ -43,24 +55,38 @@ zeeschuimer.register_module( const normalise_user = function (tweet) { const user = tweet['core'] && tweet['core']['user_results'] && tweet['core']['user_results']['result']; - if (!user) { - return null; + if (user) { + const core = user['core'] || {}; + const legacy = user['legacy'] || {}; + const avatar = user['avatar'] || {}; + + return { + id: user['rest_id'] || user['id'], + unique_id: core['screen_name'], + nickname: core['name'], + signature: legacy['description'], + avatar_thumb: avatar['image_url'] || legacy['profile_image_url_https'], + verified: !!(user['verification'] && user['verification']['verified']), + verified_type: user['verification'] ? 
user['verification']['verified_type'] : undefined, + follower_count: legacy['followers_count'], + following_count: legacy['friends_count'] + }; } - const core = user['core'] || {}; - const legacy = user['legacy'] || {}; - const avatar = user['avatar'] || {}; + const fallback_user = tweet['user'] || null; + if (!fallback_user) { + return null; + } return { - id: user['rest_id'] || user['id'], - unique_id: core['screen_name'], - nickname: core['name'], - signature: legacy['description'], - avatar_thumb: avatar['image_url'] || legacy['profile_image_url_https'], - verified: !!(user['verification'] && user['verification']['verified']), - verified_type: user['verification'] ? user['verification']['verified_type'] : undefined, - follower_count: legacy['followers_count'], - following_count: legacy['friends_count'] + id: fallback_user['id_str'] || fallback_user['id'], + unique_id: fallback_user['screen_name'], + nickname: fallback_user['name'], + signature: fallback_user['description'], + avatar_thumb: fallback_user['profile_image_url_https'], + verified: !!fallback_user['verified'], + follower_count: fallback_user['followers_count'], + following_count: fallback_user['friends_count'] }; }; @@ -85,6 +111,11 @@ zeeschuimer.register_module( } const tweet_id = String(tweet['id']); + if (seen.has(tweet_id)) { + return; + } + seen.add(tweet_id); + const parent_id = tweet['legacy']['in_reply_to_status_id_str'] || null; const post_id = root_tweet_id || tweet['legacy']['conversation_id_str'] || parent_id; const author = normalise_user(tweet); @@ -104,6 +135,14 @@ zeeschuimer.register_module( comments.push(tweet); }; + const add_from_item_content = function (item_content) { + if (!item_content || !item_content['tweet_results']) { + return; + } + + add_tweet(item_content['tweet_results']['result']); + }; + const traverse = function (obj) { for (let property in obj) { let child = obj[property]; @@ -125,24 +164,34 @@ zeeschuimer.register_module( } if ('itemContent' in entry['content']) 
{ - const item_content = entry['content']['itemContent']; - if (!item_content['tweet_results']) { - continue; - } - - add_tweet(item_content['tweet_results']['result']); + add_from_item_content(entry['content']['itemContent']); + } else if (entry['content']['content'] && entry['content']['content']['itemContent']) { + add_from_item_content(entry['content']['content']['itemContent']); } else if ('__typename' in entry['content'] && entry['content']['__typename'] === 'TimelineTimelineModule') { for (const item of entry['content']['items']) { const item_content = item['item'] && item['item']['itemContent']; - if ( - !item_content - || !['Tweet', 'TimelineTweet'].includes(item_content['__typename']) - || !item_content['tweet_results'] - ) { + if (!item_content || !['Tweet', 'TimelineTweet'].includes(item_content['__typename'])) { continue; } - add_tweet(item_content['tweet_results']['result']); + add_from_item_content(item_content); + } + } else if (entry['entryId'] && data['globalObjects'] && data['globalObjects']['tweets']) { + let tweet_id = null; + if (entry['entryId'].indexOf('tweet-') === 0) { + tweet_id = entry['entryId'].split('-')[1]; + } else if (entry['entryId'].indexOf('sq-I-t-') === 0) { + tweet_id = entry['entryId'].split('-')[3]; + } + + if (tweet_id && data['globalObjects']['tweets'][tweet_id]) { + add_tweet({ + id: tweet_id, + legacy: data['globalObjects']['tweets'][tweet_id], + user: data['globalObjects']['users'] + ? data['globalObjects']['users'][data['globalObjects']['tweets'][tweet_id]['user_id_str']] + : null + }); } } } diff --git a/popup/interface.html b/popup/interface.html index 162ecb9..2c602c7 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -382,7 +382,7 @@

Pesquisa Social

- v1.13.6 + v1.13.7

Captured data objects

diff --git a/popup/interface.js b/popup/interface.js index 0fbf9b9..5e917b4 100644 --- a/popup/interface.js +++ b/popup/interface.js @@ -172,7 +172,7 @@ function update_icon() { } /** - * Get Zeeschuimer stats + * Get extension stats * * Loads the amount of items collected, etc. This function is called * periodically to keep the numbers in the interface updated as items are @@ -573,7 +573,7 @@ const upload_poll = { }, /** - * Add dataset to Zeeschuimer history + * Add dataset to upload history * * @param progress * @returns {Promise} @@ -611,7 +611,7 @@ async function get_blob(platform) { * Get a ZIP with all downloadable media for a platform. * * The ZIP contains a manifest.json file mapping each downloaded file back to - * the Zeeschuimer item and platform post it was extracted from. + * the stored item and platform post it was extracted from. * * @param platform * @param progress_callback @@ -1242,7 +1242,7 @@ document.addEventListener('DOMContentLoaded', async function () { const current_version = version_container.innerText; const known_version = await background.browser.storage.local.get('zs-version'); if(!known_version || current_version !== known_version['zs-version']) { - const version_alert = createElement('span', {'class': 'popup new-version'}, 'Zeeschuimer has been updated to a new version! You can read the release notes via this link.'); + const version_alert = createElement('span', {'class': 'popup new-version'}, 'Pesquisa Social has been updated to a new version! 
You can read the release notes via this link.'); const ok_button = createElement('button', {'class': 'close-popup'}, 'OK'); ok_button.addEventListener('click', async function(e) { await background.browser.storage.local.set({'zs-version': current_version}); From 3c9d1048a559b295a96d0e882e4fb7d2a32ac47e Mon Sep 17 00:00:00 2001 From: Danilo F Marinho Date: Mon, 11 May 2026 17:23:05 -0300 Subject: [PATCH 5/6] Fix X comments capture for status pages --- SOURCE-CODE-README.md | 2 +- build-source-zip.ps1 | 2 +- manifest.json | 2 +- modules/twitter-comments.js | 56 +++++++++++++++++++++++++++++++++++++ popup/interface.html | 2 +- 5 files changed, 60 insertions(+), 4 deletions(-) diff --git a/SOURCE-CODE-README.md b/SOURCE-CODE-README.md index 61f8ac0..4725e36 100644 --- a/SOURCE-CODE-README.md +++ b/SOURCE-CODE-README.md @@ -45,7 +45,7 @@ powershell -ExecutionPolicy Bypass -File .\build-xpi.ps1 4. The script creates: ```text -pesquisa-social-v1.13.7.xpi +pesquisa-social-v1.13.8.xpi ``` ## Notes for reviewers diff --git a/build-source-zip.ps1 b/build-source-zip.ps1 index e0360ec..1f4bcf7 100644 --- a/build-source-zip.ps1 +++ b/build-source-zip.ps1 @@ -1,7 +1,7 @@ Add-Type -AssemblyName System.IO.Compression Add-Type -AssemblyName System.IO.Compression.FileSystem -$output = "pesquisa-social-source-v1.13.7.zip" +$output = "pesquisa-social-source-v1.13.8.zip" $root = (Get-Location).Path $includePaths = @( diff --git a/manifest.json b/manifest.json index 59a89f5..954b63c 100644 --- a/manifest.json +++ b/manifest.json @@ -3,7 +3,7 @@ "description": "Capture social media data while browsing and export it for later analysis", "manifest_version": 2, "name": "Pesquisa Social", - "version": "1.13.7", + "version": "1.13.8", "homepage_url": "mailto:danilo-f.marinho@hotmail.com", "browser_specific_settings": { diff --git a/modules/twitter-comments.js b/modules/twitter-comments.js index cc7c77f..e3915ad 100644 --- a/modules/twitter-comments.js +++ b/modules/twitter-comments.js @@ -143,6 
+143,61 @@ zeeschuimer.register_module( add_tweet(item_content['tweet_results']['result']); }; + const collect_tweet_candidates = function (obj) { + if (!obj || typeof obj !== "object") { + return; + } + + if (Array.isArray(obj)) { + for (const item of obj) { + collect_tweet_candidates(item); + } + return; + } + + // Direct tweet-like object + if ( + obj['legacy'] + && typeof obj['legacy'] === 'object' + && (obj['rest_id'] || obj['id'] || obj['legacy']['id_str']) + ) { + add_tweet(obj); + } + + // Common GraphQL wrapper + if (obj['tweet_results'] && obj['tweet_results']['result']) { + add_tweet(obj['tweet_results']['result']); + } + + // Some responses wrap the tweet one level deeper + if (obj['result'] && typeof obj['result'] === 'object') { + const result = obj['result']; + if ( + result['legacy'] + || result['tweet'] + || result['tweet_results'] + || result['__typename'] === 'Tweet' + || result['__typename'] === 'TweetWithVisibilityResults' + ) { + collect_tweet_candidates(result); + } + } + + if (obj['tweet'] && typeof obj['tweet'] === 'object') { + collect_tweet_candidates(obj['tweet']); + } + + for (let property in obj) { + if (!obj.hasOwnProperty(property) || !obj[property]) { + continue; + } + + if (typeof obj[property] === "object") { + collect_tweet_candidates(obj[property]); + } + } + }; + const traverse = function (obj) { for (let property in obj) { let child = obj[property]; @@ -202,6 +257,7 @@ zeeschuimer.register_module( }; traverse(data); + collect_tweet_candidates(data); return comments; }, 'twitter-comments' diff --git a/popup/interface.html b/popup/interface.html index 2c602c7..ce5e8cd 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -382,7 +382,7 @@

Pesquisa Social

- v1.13.7 + v1.13.8

Captured data objects

From 39eef25d2849f2fabe139c6f8585ac93a6c48940 Mon Sep 17 00:00:00 2001 From: Danilo F Marinho Date: Mon, 11 May 2026 18:27:02 -0300 Subject: [PATCH 6/6] Release v1.13.9 with X thread capture fixes --- SOURCE-CODE-README.md | 2 +- build-source-zip.ps1 | 4 ++- js/zs-background.js | 2 -- manifest.json | 2 +- modules/twitter.js | 68 ++++++++++++++++++++++++++++++++++++++----- 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/SOURCE-CODE-README.md b/SOURCE-CODE-README.md index 4725e36..9898995 100644 --- a/SOURCE-CODE-README.md +++ b/SOURCE-CODE-README.md @@ -45,7 +45,7 @@ powershell -ExecutionPolicy Bypass -File .\build-xpi.ps1 4. The script creates: ```text -pesquisa-social-v1.13.8.xpi +pesquisa-social-v1.13.9.xpi ``` ## Notes for reviewers diff --git a/build-source-zip.ps1 b/build-source-zip.ps1 index 1f4bcf7..059dfb2 100644 --- a/build-source-zip.ps1 +++ b/build-source-zip.ps1 @@ -1,7 +1,9 @@ Add-Type -AssemblyName System.IO.Compression Add-Type -AssemblyName System.IO.Compression.FileSystem -$output = "pesquisa-social-source-v1.13.8.zip" +$manifest = Get-Content "manifest.json" | ConvertFrom-Json +$version = $manifest.version +$output = "pesquisa-social-source-v$version.zip" $root = (Get-Location).Path $includePaths = @( diff --git a/js/zs-background.js b/js/zs-background.js index 10880f7..c7c7db9 100644 --- a/js/zs-background.js +++ b/js/zs-background.js @@ -344,8 +344,6 @@ window.zeeschuimer = { } }); })); - - return; } } }, diff --git a/manifest.json b/manifest.json index 954b63c..df60eaa 100644 --- a/manifest.json +++ b/manifest.json @@ -3,7 +3,7 @@ "description": "Capture social media data while browsing and export it for later analysis", "manifest_version": 2, "name": "Pesquisa Social", - "version": "1.13.8", + "version": "1.13.9", "homepage_url": "mailto:danilo-f.marinho@hotmail.com", "browser_specific_settings": { diff --git a/modules/twitter.js b/modules/twitter.js index 0a36282..66e1301 100644 --- a/modules/twitter.js +++ 
b/modules/twitter.js @@ -3,6 +3,8 @@ zeeschuimer.register_module( 'x.com', function (response, source_platform_url, source_url) { let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); + const root_tweet_id_match = source_platform_url.match(/\/status\/(\d+)/); + const root_tweet_id = root_tweet_id_match ? root_tweet_id_match[1] : null; if ( !["x.com"].includes(domain) @@ -37,6 +39,51 @@ zeeschuimer.register_module( // One of the 'instructions' is to add entries to the timeline, this is what we are interested in because what // is added to the timeline are the tweets! // So find those instructions in the object, and reconstruct the tweets from there + const normalise_tweet = function (tweet) { + if (!tweet || tweet['__typename'] === 'TweetUnavailable') { + return null; + } + + if ('tweet' in tweet) { + tweet = tweet['tweet']; + } + + if (!tweet['legacy']) { + return null; + } + + tweet['id'] = tweet['legacy']['id_str'] || tweet['rest_id']; + return tweet['id'] ? tweet : null; + }; + + const should_include_as_post = function (tweet) { + tweet = normalise_tweet(tweet); + if (!tweet) { + return false; + } + + if (!root_tweet_id) { + return true; + } + + const tweet_id = String(tweet['id']); + const legacy = tweet['legacy'] || {}; + const conversation_id = legacy['conversation_id_str'] ? String(legacy['conversation_id_str']) : null; + const reply_to_id = legacy['in_reply_to_status_id_str'] ? String(legacy['in_reply_to_status_id_str']) : null; + + // On a single-tweet thread page, keep the root post in the posts + // stream and leave replies to the dedicated comments module. 
+ if (tweet_id === root_tweet_id) { + return true; + } + + if (conversation_id === root_tweet_id || reply_to_id === root_tweet_id || !!reply_to_id) { + return false; + } + + return true; + }; + let traverse = function (obj) { for (let property in obj) { let child = obj[property]; @@ -69,11 +116,11 @@ zeeschuimer.register_module( continue; } - if('tweet' in tweet) { - // sometimes this is nested once more, for some reason - tweet = tweet['tweet']; + tweet = normalise_tweet(tweet); + if (!tweet || !should_include_as_post(tweet)) { + continue; } - tweet['id'] = tweet['legacy']['id_str']; + // distinguish tweets that were included because they were "promoted" from // those that are actually part of the user/home timeline or search result. // assume a tweet was promoted if itemContent has promotedMetadata @@ -87,7 +134,12 @@ zeeschuimer.register_module( }).map(item => { return item['item']['itemContent']['tweet_results']['result'] })) { - tweets.push({...reply_tweet, id: parseInt(reply_tweet['rest_id'])}); + const tweet = normalise_tweet(reply_tweet); + if (!tweet || !should_include_as_post(tweet)) { + continue; + } + + tweets.push(tweet); } } else { // in other cases this object only contains a reference to the full tweet, which is in turn @@ -116,7 +168,9 @@ zeeschuimer.register_module( // the user is also stored as a reference - so add the user data to the tweet tweet['user'] = data['globalObjects']['users'][tweet['legacy']['user_id_str']] - tweets.push(tweet); + if (should_include_as_post(tweet)) { + tweets.push(tweet); + } } } @@ -130,4 +184,4 @@ zeeschuimer.register_module( return tweets; }, 'twitter.com' -); \ No newline at end of file +);