Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Actor images build from the monorepo root (dockerContextDir in .actor/actor.json).
# Keep the context lean: ship sources + manifests + the pnpm lockfile only.
node_modules
**/node_modules
.git
.turbo
**/dist
**/storage
apify_storage
crawlee_storage
*.log
.DS_Store
.idea
.vscode
2 changes: 2 additions & 0 deletions .github/workflows/on-pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ jobs:
name: Build & Test
if: (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:'))
runs-on: ubuntu-22.04
timeout-minutes: 30

steps:
- name: Checkout repository
Expand All @@ -38,6 +39,7 @@ jobs:
lint:
name: Lint
runs-on: ubuntu-22.04
timeout-minutes: 30
steps:
- name: Checkout repository
uses: actions/checkout@v6
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/release-generic-actors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ on:
type: boolean
required: false
default: true
sitemap-scraper:
description: apify/sitemap-extractor
type: boolean
required: false
default: true
build-channel:
description: Build channel
type: choice
Expand Down Expand Up @@ -95,6 +100,13 @@ jobs:
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.camoufox-scraper }}
# Platform actor is apify/sitemap-extractor (name in .actor/actor.json)
- actor: sitemap-scraper
stable-version: '0.1'
stable-build-tag: latest
development-version: '0.0'
development-build-tag: development
should-build: ${{ github.event.inputs.sitemap-scraper }}
steps:
- uses: actions/checkout@v6

Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/test-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ jobs:
build_and_test:
name: Build & Test
runs-on: ubuntu-22.04
timeout-minutes: 60

steps:
- name: Cancel Workflow Action
Expand Down Expand Up @@ -40,6 +41,11 @@ jobs:
- name: Install pnpm and dependencies
uses: apify/actions/pnpm-install@v1.1.2

- name: Install browsers
run: |
pnpm exec puppeteer browsers install chrome
pnpm exec playwright install chromium
- name: Build
run: pnpm ci:build

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pids
.vscode
yarn.lock
.yarn
# npm locks are generated locally in actor dirs (the monorepo uses pnpm-lock.yaml)
tmp
jsconfig.json
types
Expand Down
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ Please provide steps to reproduce if you found a bug or ideally fork the reposit

Before you submit your pull request, consider the following guidelines:

- Search [GitHub](https://github.com/apify/apify-sdk-js/pulls) for an open or closed PR that relates to your submission. You don't want to duplicate effort.
- Search [GitHub](https://github.com/apify/actor-scraper/pulls) for an open or closed PR that relates to your submission. You don't want to duplicate effort.

- Fork the project and install NPM dependencies.
- Fork the project and install dependencies with pnpm.

- Run tests before you start working, to be sure they all pass and your setup is working correctly:

```sh
npm run test
pnpm test
```

- Be sure to **include appropriate test cases**. Tests help make it clear what the PR is fixing and also make sure the changes won't break over time.
Expand Down
16 changes: 8 additions & 8 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "apify-sdk-js",
"name": "actor-scraper",
"private": true,
"description": "Apify SDK monorepo",
"description": "Apify generic scrapers monorepo",
"keywords": [
"apify",
"headless",
Expand All @@ -23,12 +23,12 @@
"license": "Apache-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/apify/apify-ts"
"url": "git+https://github.com/apify/actor-scraper.git"
},
"bugs": {
"url": "https://github.com/apify/apify-ts/issues"
"url": "https://github.com/apify/actor-scraper/issues"
},
"homepage": "https://sdk.apify.com",
"homepage": "https://github.com/apify/actor-scraper",
"scripts": {
"prepare": "husky",
"prepublishOnly": "turbo run copy",
Expand Down Expand Up @@ -61,7 +61,7 @@
"@apify/tsconfig": "^0.1.2",
"@commitlint/config-conventional": "^20.0.0",
"@isaacs/brace-expansion": "^5.0.1",
"@playwright/browser-chromium": "^1.46.0",
"@playwright/browser-chromium": "^1.61.0",
"@types/content-type": "^1.1.8",
"@types/fs-extra": "^11.0.4",
"@types/node": "^24.0.0",
Expand All @@ -81,8 +81,8 @@
"oxfmt": "0.46.0",
"oxlint": "1.62.0",
"oxlint-tsgolint": "0.22.0",
"playwright": "^1.46.0",
"puppeteer": "^24.0.0",
"playwright": "^1.61.0",
"puppeteer": "^25.1.0",
"rimraf": "^6.0.1",
"tsx": "^4.16.5",
"turbo": "2.9.1",
Expand Down
2 changes: 2 additions & 0 deletions packages/actor-scraper/camoufox-scraper/.actor/actor.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"name": "camoufox-scraper",
"version": "0.1",
"buildTag": "latest",
"dockerContextDir": "../../../..",
"dockerfile": "../Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
Expand Down
2 changes: 2 additions & 0 deletions packages/actor-scraper/camoufox-scraper/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ dist

# installed files
node_modules

# local npm lock is a workspace-link lock; it breaks the isolated build (the platform builds without it)
45 changes: 27 additions & 18 deletions packages/actor-scraper/camoufox-scraper/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
# Build context is the monorepo root (set via dockerContextDir in .actor/actor.json),
# so the build can use the pnpm workspace lockfile for a deterministic install.
FROM apify/actor-node-playwright-camoufox:22 AS builder

COPY --chown=myuser package*.json ./
# The browser base runs as `myuser`; the build stage needs root for corepack + a writable workdir.
USER root
WORKDIR /app

RUN npm install --include=dev --audit=false
RUN corepack enable

COPY --chown=myuser . ./
# Browsers ship with the base image; never let an npm postinstall download them.
# puppeteer is pulled transitively via @crawlee/browser-pool even for non-puppeteer actors.
ENV PUPPETEER_SKIP_DOWNLOAD=true \
PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1

RUN npm run build
# Whole workspace (root .dockerignore keeps node_modules/.git/dist out of the context).
COPY . ./

FROM apify/actor-node-playwright-camoufox:22
# Deterministic install (frozen lockfile, honors minimumReleaseAge) of the actor + its workspace deps.
RUN pnpm install --frozen-lockfile --filter actor-camoufox-scraper...

COPY --from=builder --chown=myuser /home/myuser/dist ./dist
# Build the actor and its workspace dependency @apify/scraper-tools.
RUN pnpm --filter actor-camoufox-scraper... build

COPY --chown=myuser package*.json ./
# Self-contained production bundle; inject-workspace-packages copies the built
# @apify/scraper-tools (a runtime workspace dep) into node_modules instead of symlinking it.
RUN pnpm config set inject-workspace-packages true \
&& pnpm --filter actor-camoufox-scraper deploy --prod /deploy

RUN npm --quiet set progress=false \
&& npm install --omit=dev \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
FROM apify/actor-node-playwright-camoufox:22

COPY --chown=myuser . ./
# The base image's WORKDIR is /home/myuser; the image ships a template node_modules plus the Xvfb entrypoint.
# Drop the template's node_modules and overlay only the lean production bundle; the inherited
# ENTRYPOINT (xvfb-entrypoint.sh) still wraps CMD with Xvfb, and the bundled browser is reused.
RUN rm -rf node_modules
COPY --from=builder --chown=myuser /deploy ./

ENV APIFY_DISABLE_OUTDATED_WARNING=1

CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
CMD ["node", "dist/main.js"]
4 changes: 2 additions & 2 deletions packages/actor-scraper/camoufox-scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
},
"repository": {
"type": "git",
"url": "https://github.com/apify/apify-sdk-js"
"url": "https://github.com/apify/actor-scraper"
},
"author": {
"name": "Apify Technologies",
"email": "support@apify.com",
"url": "https://apify.com"
},
"license": "Apache-2.0",
"homepage": "https://github.com/apify/apify-sdk-js"
"homepage": "https://github.com/apify/actor-scraper"
}
2 changes: 2 additions & 0 deletions packages/actor-scraper/cheerio-scraper/.actor/actor.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"name": "cheerio-scraper",
"version": "0.1",
"buildTag": "latest",
"dockerContextDir": "../../../..",
"dockerfile": "../Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
Expand Down
2 changes: 2 additions & 0 deletions packages/actor-scraper/cheerio-scraper/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ storage

# installed files
node_modules

# local npm lock is a workspace-link lock; it breaks the isolated build (the platform builds without it)
37 changes: 19 additions & 18 deletions packages/actor-scraper/cheerio-scraper/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
# Build context is the monorepo root (set via dockerContextDir in .actor/actor.json),
# so the build can use the pnpm workspace lockfile for a deterministic install.
FROM apify/actor-node:22 AS builder

COPY package*.json ./
WORKDIR /app

RUN npm install --include=dev --audit=false
RUN corepack enable

# Whole workspace (root .dockerignore keeps node_modules/.git/dist out of the context).
COPY . ./

RUN npm run build
# Deterministic install (frozen lockfile, honors minimumReleaseAge) of the actor + its workspace deps.
RUN pnpm install --frozen-lockfile --filter actor-cheerio-scraper...

FROM apify/actor-node:22
# Build the actor and its workspace dependency @apify/scraper-tools.
RUN pnpm --filter actor-cheerio-scraper... build

COPY --from=builder /usr/src/app/dist ./dist
# Self-contained production bundle; inject-workspace-packages copies the built
# @apify/scraper-tools (a runtime workspace dep) into node_modules instead of symlinking it.
RUN pnpm config set inject-workspace-packages true \
&& pnpm --filter actor-cheerio-scraper deploy --prod /deploy

COPY package*.json ./
FROM apify/actor-node:22

RUN rm -rf node_modules \
&& npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
WORKDIR /usr/src/app

COPY . ./
# Replace the base image's template with only the lean production bundle.
RUN find /usr/src/app -mindepth 1 -delete
COPY --from=builder /deploy ./

ENV APIFY_DISABLE_OUTDATED_WARNING=1

CMD npm run start:prod --silent
CMD ["node", "dist/main.js"]
2 changes: 1 addition & 1 deletion packages/actor-scraper/cheerio-scraper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ async function pageFunction(context) {
}
```

The code runs in [Node.js 16](https://nodejs.org/) and the function accepts a single argument, the `context` object, whose properties are listed below.
The code runs in [Node.js 22](https://nodejs.org/) and the function accepts a single argument, the `context` object, whose properties are listed below.

The return value of the page function is an object (or an array of objects) representing the data extracted from the web page. The return value must be stringify-able to JSON, i.e. it can only contain basic types and no circular references. If you prefer not to extract any data from the page and skip it in the clean results, simply return `null` or `undefined`.

Expand Down
4 changes: 2 additions & 2 deletions packages/actor-scraper/cheerio-scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
},
"repository": {
"type": "git",
"url": "https://github.com/apify/apify-sdk-js"
"url": "https://github.com/apify/actor-scraper"
},
"author": {
"name": "Apify Technologies",
Expand All @@ -38,5 +38,5 @@
"Ondra Urban <ondra@apify.com>"
],
"license": "Apache-2.0",
"homepage": "https://github.com/apify/apify-sdk-js"
"homepage": "https://github.com/apify/actor-scraper"
}
2 changes: 2 additions & 0 deletions packages/actor-scraper/jsdom-scraper/.actor/actor.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"name": "jsdom-scraper",
"version": "0.1",
"buildTag": "latest",
"dockerContextDir": "../../../..",
"dockerfile": "../Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
Expand Down
2 changes: 2 additions & 0 deletions packages/actor-scraper/jsdom-scraper/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ storage

# installed files
node_modules

# local npm lock is a workspace-link lock; it breaks the isolated build (the platform builds without it)
38 changes: 21 additions & 17 deletions packages/actor-scraper/jsdom-scraper/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,29 +1,33 @@
# Build context is the monorepo root (set via dockerContextDir in .actor/actor.json),
# so the build can use the pnpm workspace lockfile for a deterministic install.
FROM apify/actor-node:22 AS builder

COPY package*.json ./
WORKDIR /app

RUN npm install --include=dev --audit=false
RUN corepack enable

# Whole workspace (root .dockerignore keeps node_modules/.git/dist out of the context).
COPY . ./

RUN npm run build
# Deterministic install (frozen lockfile, honors minimumReleaseAge) of the actor + its workspace deps.
RUN pnpm install --frozen-lockfile --filter actor-jsdom-scraper...

FROM apify/actor-node:22
# Build the actor and its workspace dependency @apify/scraper-tools.
RUN pnpm --filter actor-jsdom-scraper... build

# Self-contained production bundle; inject-workspace-packages copies the built
# @apify/scraper-tools (a runtime workspace dep) into node_modules instead of symlinking it.
RUN pnpm config set inject-workspace-packages true \
&& pnpm --filter actor-jsdom-scraper deploy --prod /deploy

COPY --from=builder /usr/src/app/dist ./dist
FROM apify/actor-node:22

COPY package*.json ./
WORKDIR /usr/src/app

RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Replace the base image's template with only the lean production bundle.
RUN find /usr/src/app -mindepth 1 -delete
COPY --from=builder /deploy ./

COPY . ./
ENV APIFY_DISABLE_OUTDATED_WARNING=1

CMD npm run start:prod --silent
CMD ["node", "dist/main.js"]
Loading
Loading