Meridian.AI Train #753

Workflow file for this run

	# Hourly training workflow: optionally re-seeds the Hugging Face repo, then
	# pulls the latest checkpoint, trains for a bounded number of steps, and
	# pushes the checkpoint back to the Hub.
	name: MeridianFormer Hourly Training

	on:
	  schedule:
	    # Every hour, every day
	    - cron: '0 * * * *'
	  workflow_dispatch:
	    inputs:
	      force_seed:
	        description: 'Nuke & re-seed HF repo with fresh model?'
	        type: boolean
	        default: false
	      max_steps:
	        description: 'Training steps per run (default: 200)'
	        type: string
	        default: '200'

	# One run per ref at a time; a newer run waits rather than cancelling the
	# run that is already in progress.
	concurrency:
	  group: meridian-train-${{ github.ref }}
	  cancel-in-progress: false

	jobs:
	  # Manual-only job: runs scripts/seed_hf_repo.py when the force_seed
	  # dispatch input is set. On scheduled runs the input is absent, so the
	  # condition is false and the job is skipped.
	  seed:
	    name: "🔧 Nuke & Seed HF"
	    # Compared against the string 'true' (github.event.inputs form).
	    if: github.event.inputs.force_seed == 'true'
	    runs-on: ubuntu-latest
	    environment: HuggingFace Hub
	    steps:
	      - uses: actions/checkout@v4

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.11'

	      - name: Install deps
	        run: |
	          pip install huggingface_hub transformers torch safetensors sentencepiece

	      - name: Nuke & Seed
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	          PYTHONPATH: .
	        run: python scripts/seed_hf_repo.py

	  # Main job: runs after seed, whether seed succeeded or was skipped.
	  train:
	    name: "🚀 Hourly Training Run"
	    needs: seed
	    # always() keeps the job eligible when seed is skipped; a failed or
	    # cancelled seed still blocks training.
	    if: |
	      always() &&
	      (needs.seed.result == 'success' || needs.seed.result == 'skipped')
	    runs-on: ubuntu-latest
	    timeout-minutes: 55  # Leave 5min buffer in the hour
	    permissions:
	      contents: write

	    steps:
	      - uses: actions/checkout@v4
	        with:
	          fetch-depth: 0

	      - name: Set up Python 3.11
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.11'
	          cache: 'pip'

	      # NOTE(review): setup-python is already asked to cache pip above, so
	      # this explicit cache of ~/.cache/pip looks redundant - confirm.
	      - name: Cache Dependencies
	        uses: actions/cache@v4
	        id: cache-deps
	        with:
	          path: ~/.cache/pip
	          key: ${{ runner.os }}-pip-meridian-${{ hashFiles('requirements.txt') }}
	          restore-keys: |
	            ${{ runner.os }}-pip-meridian-

	      - name: Install Dependencies
	        run: |
	          python -m pip install --upgrade pip
	          pip install -r requirements.txt
	          pip install ruff black

	      # Rewrites files in the runner workspace only; the final step stages
	      # nothing but dataset_state.json, so these edits are not committed.
	      - name: Lint & Format
	        run: |
	          black . --quiet
	          ruff check . --fix --quiet

	      # Best-effort restore of ./checkpoint from the Hub. Any error is
	      # printed and the step still succeeds, so training starts fresh.
	      - name: Pull Checkpoint from HuggingFace
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          python -c "
	          from huggingface_hub import snapshot_download, list_repo_files
	          import os, shutil

	          repo_id = 'MeridianAlgo/FinAI-Lite'
	          token = os.getenv('HF_TOKEN')

	          try:
	              files = list(list_repo_files(repo_id=repo_id, token=token))
	              checkpoint_files = [f for f in files if f.startswith('checkpoint/')]

	              if checkpoint_files:
	                  print(f'Found {len(checkpoint_files)} checkpoint files')
	                  temp_dir = './temp_download'
	                  # Only checkpoint/ is kept below, so only that is fetched.
	                  snapshot_download(
	                      repo_id=repo_id,
	                      local_dir=temp_dir,
	                      allow_patterns='checkpoint/*',
	                      token=token
	                  )
	                  if os.path.exists(os.path.join(temp_dir, 'checkpoint')):
	                      # Replace any stale local checkpoint with the pulled one.
	                      if os.path.exists('./checkpoint'):
	                          shutil.rmtree('./checkpoint')
	                      shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint')
	                      print('✓ Checkpoint pulled')

	                      # Report the size of each expected file that is present.
	                      for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']:
	                          path = os.path.join('./checkpoint', cf)
	                          if os.path.exists(path):
	                              size_mb = os.path.getsize(path) / (1024 * 1024)
	                              print(f' ✓ {cf} ({size_mb:.2f} MB)')

	                  if os.path.exists(temp_dir):
	                      shutil.rmtree(temp_dir)
	              else:
	                  print('No checkpoint found — will start fresh')
	          except Exception as e:
	              print(f'Pull error: {e}')
	              import traceback
	              traceback.print_exc()
	              print('Will start fresh.')
	          "

	      # timeout 2700 caps training at 45 minutes. The trailing echo makes the
	      # step succeed whatever train.py returns, so the upload steps still run.
	      - name: Train
	        id: training
	        continue-on-error: true
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	          COMET_API_KEY: ${{ secrets.COMET_API_KEY }}
	          SMOKE_TEST: '0'
	          # Falls back to '200' when the dispatch input is absent or empty.
	          MAX_STEPS: ${{ github.event.inputs.max_steps || '200' }}
	          TOTAL_STEPS: '100000'
	          GRAD_ACCUM: '4'
	          BATCH_SIZE: '2'
	          BLOCK_SIZE: '512'
	          USE_EWC: '1'
	          GRADIENT_CHECKPOINTING: '1'
	          PYTHONPATH: .
	          PYTHONUNBUFFERED: '1'
	        run: |
	          timeout 2700 python train.py || echo "Training completed or timed out"

	      # Pushes ./checkpoint to the Hub; skipped cleanly when no checkpoint
	      # directory exists, fails the step when the upload raises.
	      - name: Upload Checkpoint to HuggingFace
	        if: always()
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          python -c "
	          from huggingface_hub import HfApi
	          import os, sys

	          if not os.path.exists('./checkpoint'):
	              print('No checkpoint found, skipping')
	              sys.exit(0)

	          api = HfApi()
	          print('Uploading to MeridianAlgo/FinAI-Lite...')
	          try:
	              api.upload_folder(
	                  folder_path='./checkpoint',
	                  repo_id='MeridianAlgo/FinAI-Lite',
	                  path_in_repo='checkpoint',
	                  commit_message='Hourly training update [skip ci]',
	                  token=os.getenv('HF_TOKEN')
	              )
	              print('✓ Upload successful')
	          except Exception as e:
	              print(f'✗ Upload failed: {e}')
	              sys.exit(1)
	          "

	      - name: Save Training Artifact
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: meridian-checkpoint-${{ github.run_number }}
	          path: checkpoint/
	          retention-days: 3

	      # Commits dataset_state.json if it changed, then rebases onto main and
	      # pushes. If the rebase fails it is aborted and a merge pull is tried.
	      # NOTE(review): always pushes to main, even when the run was dispatched
	      # from another ref - confirm this is intended.
	      - name: Sync Dataset State
	        if: always()
	        run: |
	          git config --local user.email "action@github.com"
	          git config --local user.name "MeridianFormer Bot"
	          git add dataset_state.json
	          git diff --staged --quiet || git commit -m "chore: sync dataset state [skip ci]"
	          git pull --rebase -X theirs origin main || (git rebase --abort && git pull --no-rebase origin main)
	          git push origin main

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Meridian.AI Train #753

Workflow file

Meridian.AI Train #753

Uh oh!

Workflow file for this run