// Reconstructed from a whitespace-collapsed `git format-patch` export; the diff
// markers and hunk bookkeeping could not be preserved, only the added code.
//   Commit:  6b7218c9b2b718348d191f37dc56e4d3506d8fb2
//   From:    dale-wahl <32108944+dale-wahl@users.noreply.github.com>
//   Date:    Thu, 7 May 2026 15:03:10 +0000
//   Subject: [PATCH] chore: sync map_item for instagram from 4CAT e80c18d82518c7c213e8fe057a5336d0c6c03820
//   Target:  modules/instagram.js (appended after extractEmbeddedInstagramJSON)
//
// NOTE(review): MappedItem, MapItemException, MissingMappedField and
// normalize_url_encoding are used below but not defined in this block; they
// are assumed to be in scope wherever this code is loaded — confirm.

// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ===
// (regenerated from datasources/instagram/search_instagram.py)

// A hashtag is '#' followed by a run of characters that are neither whitespace
// nor one of the listed punctuation marks. The capture group holds the tag
// without its leading '#'.
// NOTE(review): the generated literal contained raw line breaks inside the
// character class (a syntax error); they are read here as "whitespace". The
// upstream Python pattern may exclude further characters — confirm.
const HASHTAG_PATTERN = /#([^\s!@#$%^&*()_+{}:"|<>?;',.\/`~]+)/g;

/**
 * Collect the hashtags of a caption as a comma-separated string.
 *
 * @param {*} caption - Caption text; anything that is not a string (for
 *   example a MissingMappedField placeholder) yields an empty result.
 * @returns {string} Tags without the leading '#', joined with ','; '' if none.
 */
function extract_caption_hashtags(caption) {
    if (typeof caption !== 'string') {
        return '';
    }
    // matchAll works on a copy of the regex, so the shared /g pattern carries
    // no lastIndex state between calls.
    return Array.from(caption.matchAll(HASHTAG_PATTERN), (found) => found[1]).join(',');
}

/**
 * Read the caption text of an item that stores it under `caption.text`.
 *
 * @param {object} node - Raw item.
 * @returns {string|MissingMappedField} The text, '' when the caption is
 *   present but empty/null, or a placeholder when the key is absent.
 */
function read_caption_text(node) {
    if (!('caption' in node)) {
        return new MissingMappedField('');
    }
    return node.caption?.text ?? '';
}

/**
 * Tell whether a user/owner record actually carries any data.
 *
 * @param {object|null|undefined} record
 * @returns {boolean} False for null, undefined and empty objects.
 */
function has_fields(record) {
    return record != null && Object.keys(record).length > 0;
}

/**
 * Verify that `user` and `owner` describe the same account.
 *
 * The check only applies when both records carry data. An empty object is
 * truthy in JavaScript, so a plain `user && owner` test would flag every item
 * that has only one of the two records.
 *
 * @param {object} user
 * @param {object} owner
 * @throws {MapItemException} When ids differ and usernames differ as well.
 */
function assert_same_author(user, owner) {
    if (!has_fields(user) || !has_fields(owner)) {
        return;
    }
    if (owner.id === user.id) {
        // Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
        return;
    }
    if (user.username !== owner.username) {
        throw new MapItemException('Unable to parse item: different user and owner');
    }
}

/**
 * Flatten an item's `location` object into the four mapped location fields.
 *
 * @param {object|null|undefined} location
 * @returns {{name: *, latlong: string, city: *, location_id: *}} Empty strings
 *   when there is no location.
 */
function map_location_fields(location) {
    if (!location) {
        return { name: '', latlong: '', city: '', location_id: '' };
    }
    const { lat, lng } = location;
    return {
        name: location.name,
        location_id: location.pk,
        // Null check rather than truthiness so a coordinate of 0 is kept.
        latlong: lat != null && lng != null ? `${lat},${lng}` : '',
        city: location.city,
    };
}

/**
 * Map a raw Instagram item to a MappedItem, dispatching on the item's shape.
 *
 * @param {object} item - Raw item as collected.
 * @returns {MappedItem} Wrapper around the flat mapped record.
 * @throws {MapItemException} When the item looks like an ad, or when its
 *   user and owner records contradict each other.
 */
export function map_item(item) {
    const link = item.link ?? '';
    const is_ad = item.product_type === 'ad'
        || (typeof link === 'string' && link.startsWith('https://www.facebook.com/ads/ig_redirect'));
    if (is_ad) {
        throw new MapItemException('appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.');
    }

    const typename = item.__typename;
    // `in` cannot be applied to a string; use a substring test instead.
    if (typeof typename === 'string' && typename.toLowerCase().includes('polaris')) {
        return new MappedItem(parse_polaris_item(item));
    }
    if ('__typename' in item && typename !== 'XDTMediaDict') {
        return new MappedItem(parse_graph_item(item));
    }
    return new MappedItem(parse_itemlist_item(item));
}

/**
 * Map an item whose `__typename` contains "polaris".
 *
 * Timestamps, counts, tags and location are always emitted as
 * MissingMappedField placeholders for this shape.
 *
 * @param {object} node - Raw item.
 * @returns {object} Flat mapped record.
 * @throws {MapItemException} When user and owner describe different accounts.
 */
function parse_polaris_item(node) {
    const user = node.user ?? {};
    const owner = node.owner ?? {};
    assert_same_author(user, owner);

    const caption = read_caption_text(node);

    const type_map = { XIGPolarisPhotoMedia: 'photo', XIGPolarisVideoMedia: 'video' };
    const is_carousel = node.__typename === 'XIGPolarisCarouselMedia';

    return {
        collected_from_url: normalize_url_encoding(node.__import_meta?.source_platform_url),
        collected_from_view: node._zs_instagram_view ?? '',
        partial_item: node._zs_partial ?? false,
        id: node.code,
        timestamp: new MissingMappedField(0),
        thread_id: node.code,
        parent_id: node.code,
        url: `https://www.instagram.com/p/${node.code}`,
        body: caption,

        author_id: user.id ?? owner.id ?? new MissingMappedField(''),
        author: user.username ?? owner.username ?? new MissingMappedField(''),
        author_fullname: user.full_name ?? owner.full_name ?? new MissingMappedField(''),
        verified: 'is_verified' in user ? user.is_verified : new MissingMappedField(false),
        author_avatar_url: user.profile_pic_url ?? owner.profile_pic_url ?? new MissingMappedField(''),

        coauthors: new MissingMappedField(''),
        coauthor_fullnames: new MissingMappedField(''),
        coauthor_ids: new MissingMappedField(''),

        media_type: type_map[node.__typename] ?? 'unknown',
        num_media: is_carousel ? node.carousel_media?.length ?? new MissingMappedField(0) : 1,
        image_urls: node.display_uri ?? new MissingMappedField(''),
        // The key may be present with a null value, so guard every step.
        media_urls: node.video_versions?.[0]?.url ?? new MissingMappedField(''),

        hashtags: extract_caption_hashtags(caption),
        usertags: new MissingMappedField(''),
        play_count: node.play_count ?? new MissingMappedField(0),

        likes_hidden: new MissingMappedField(''),
        num_likes: new MissingMappedField(0),
        num_comments: new MissingMappedField(0),

        location_name: new MissingMappedField(''),
        location_id: new MissingMappedField(''),
        location_latlong: new MissingMappedField(''),
        location_city: new MissingMappedField(''),

        unix_timestamp: new MissingMappedField(0),
        missing_media: null,
    };
}

/**
 * Map an item that has a `__typename` other than 'XDTMediaDict' and is not a
 * polaris item (edge-based structure: `edge_media_to_caption`, etc.).
 *
 * @param {object} node - Raw item.
 * @returns {object} Flat mapped record.
 * @throws {MapItemException} When user and owner describe different accounts.
 */
function parse_graph_item(node) {
    const caption = node.edge_media_to_caption?.edges?.[0]?.node?.text ?? new MissingMappedField('');

    const is_sidecar = node.__typename === 'GraphSidecar';
    const children = node.edge_sidecar_to_children?.edges ?? [];
    // For a sidecar the media URL is taken from its first child.
    const media_node = is_sidecar ? children[0]?.node ?? node : node;

    let media_url;
    if (media_node.__typename === 'GraphVideo') {
        media_url = media_node.video_url;
    } else if (media_node.__typename === 'GraphImage') {
        const resources = media_node.display_resources ?? media_node.thumbnail_resources;
        // Read the last entry without popping it, so the input item is not mutated.
        const last_resource = Array.isArray(resources) ? resources[resources.length - 1] : undefined;
        media_url = last_resource ? last_resource.src : media_node.display_url ?? '';
    } else {
        media_url = media_node.display_url;
    }

    const type_map = { GraphSidecar: 'photo', GraphVideo: 'video' };
    let media_type;
    if (is_sidecar) {
        const child_types = new Set(children.map((child) => child.node.__typename));
        media_type = child_types.size > 1 ? 'mixed' : type_map[child_types.values().next().value] ?? 'unknown';
    } else {
        media_type = type_map[node.__typename] ?? 'unknown';
    }

    const location = map_location_fields(node.location);
    const no_likes = Boolean(node.like_and_view_counts_disabled);

    // Either record may be absent for this shape; default both to {} so the
    // property reads below cannot throw.
    const user = node.user ?? {};
    const owner = node.owner ?? {};
    assert_same_author(user, owner);

    const tagged_users = node.edge_media_to_tagged_user?.edges ?? [];

    return {
        id: node.shortcode,
        post_source_domain: normalize_url_encoding(node.__import_meta?.source_platform_url),
        collected_from_view: node._zs_instagram_view ?? '',
        partial_item: node._zs_partial ?? '',
        timestamp: new Date(node.taken_at_timestamp * 1000).toISOString(),
        thread_id: node.shortcode,
        parent_id: node.shortcode,
        url: `https://www.instagram.com/p/${node.shortcode}`,
        body: caption,

        author: user.username ?? owner.username ?? new MissingMappedField(''),
        author_fullname: user.full_name ?? owner.full_name ?? new MissingMappedField(''),
        is_verified: Boolean(user.is_verified),
        author_avatar_url: user.profile_pic_url ?? owner.profile_pic_url ?? new MissingMappedField(''),

        coauthors: new MissingMappedField(''),
        coauthor_fullnames: new MissingMappedField(''),
        coauthor_ids: new MissingMappedField(''),

        media_type: media_type,
        num_media: is_sidecar ? children.length : 1,
        image_urls: node.display_url,
        media_urls: media_url,

        hashtags: extract_caption_hashtags(caption),
        usertags: tagged_users.map((tagged) => tagged.node.user.username).join(','),
        play_count: node.view_count ?? node.play_count ?? new MissingMappedField(0),
        likes_hidden: no_likes ? 'yes' : 'no',
        num_likes: no_likes
            ? new MissingMappedField(0)
            : node.edge_media_preview_like?.count ?? new MissingMappedField(0),
        num_comments: node.edge_media_preview_comment?.count ?? 0,

        location_name: location.name,
        location_id: location.location_id,
        location_latlong: location.latlong,
        location_city: location.city,

        unix_timestamp: node.taken_at_timestamp,
        missing_media: null,
    };
}

/**
 * Map an item without a graph/polaris `__typename` (numeric `media_type`:
 * 1 and 2 map to 'photo' and 'video', 8 carries `carousel_media`).
 *
 * @param {object} node - Raw item.
 * @returns {object} Flat mapped record.
 * @throws {MapItemException} When a non-partial video lacks a video URL, or
 *   when user and owner describe different accounts.
 */
function parse_itemlist_item(node) {
    const partial_item = node._zs_partial ?? false;
    const is_carousel = node.media_type === 8;
    const caption = read_caption_text(node);

    const type_map = { 1: 'photo', 2: 'video' };
    const display_urls = [];
    const media_urls = [];
    const media_types = new Set();
    let missing_media = null;

    const media_nodes = is_carousel ? node.carousel_media : [node];
    for (const media_node of media_nodes) {
        const image_url = media_node.image_versions2?.candidates?.[0]?.url;

        if (media_node.media_type === 2) {
            const video_url = media_node.video_versions?.[0]?.url;
            if (video_url == null && !partial_item) {
                // A complete video item must always carry a video URL.
                throw new MapItemException('Instagram item format change');
            }
            // Prefer the still image for display, fall back to the video itself.
            const display_url = image_url ?? video_url;
            if (display_url != null) {
                display_urls.push(display_url);
            }
            if (video_url != null) {
                media_urls.push(video_url);
            }
        } else if (media_node.media_type === 1 && image_url != null) {
            display_urls.push(image_url);
            media_urls.push(image_url);
        } else {
            missing_media = new MissingMappedField('');
        }

        media_types.add(type_map[media_node.media_type] ?? 'unknown');
    }

    const media_type = media_types.size > 1 ? 'mixed' : media_types.values().next().value;

    let num_comments;
    if ('comment_count' in node) {
        num_comments = node.comment_count;
    } else if (Array.isArray(node.comments)) {
        num_comments = node.comments.length;
    } else {
        num_comments = -1;
    }

    const location = map_location_fields(node.location);

    const user = node.user ?? {};
    const owner = node.owner ?? {};
    assert_same_author(user, owner);

    // Missing names become '' so the joined strings never contain the text
    // form of a placeholder object.
    const coauthor_nodes = node.coauthor_producers ?? [];
    const coauthors = coauthor_nodes.map((coauthor) => coauthor.username ?? '').join(',');
    const coauthor_fullnames = coauthor_nodes.map((coauthor) => coauthor.full_name ?? '').join(',');
    const coauthor_ids = coauthor_nodes.map((coauthor) => coauthor.id).join(',');

    const no_likes = Boolean(node.like_and_view_counts_disabled);
    const usertags = (node.usertags?.in ?? []).map((tagged) => tagged.user.username).join(',');

    // Partial items get placeholder timestamps instead of reading taken_at.
    const timestamp = partial_item ? new MissingMappedField(0) : new Date(node.taken_at * 1000).toISOString();
    const unix_timestamp = partial_item ? new MissingMappedField(0) : node.taken_at;

    return {
        collected_from_url: normalize_url_encoding(node.__import_meta?.source_platform_url),
        collected_from_view: node._zs_instagram_view ?? '',
        partial_item: node._zs_partial ?? '',
        id: node.code,
        timestamp: timestamp,
        thread_id: node.code,
        parent_id: node.code,
        url: `https://www.instagram.com/p/${node.code}`,
        body: caption,

        author_id: user.id ?? owner.id ?? new MissingMappedField(''),
        author: user.username ?? owner.username ?? new MissingMappedField(''),
        author_fullname: user.full_name ?? owner.full_name ?? new MissingMappedField(''),
        verified: Boolean(user.is_verified),
        author_avatar_url: user.profile_pic_url ?? owner.profile_pic_url ?? new MissingMappedField(''),

        coauthors: coauthors,
        coauthor_fullnames: coauthor_fullnames,
        coauthor_ids: coauthor_ids,

        media_type: media_type,
        num_media: is_carousel ? node.carousel_media.length : 1,
        image_urls: display_urls.join(','),
        media_urls: media_urls.join(','),

        hashtags: extract_caption_hashtags(caption),
        usertags: usertags,
        play_count: node.view_count ?? node.play_count ?? new MissingMappedField(0),
        likes_hidden: no_likes ? 'yes' : 'no',
        num_likes: no_likes ? new MissingMappedField(0) : node.like_count,
        num_comments: num_comments,

        location_name: location.name,
        location_id: location.location_id,
        location_latlong: location.latlong,
        location_city: location.city,

        unix_timestamp: unix_timestamp,
        missing_media: missing_media,
    };
}
// === end auto-generated ===