Skip to content

Commit e325afe

Browse files
authored
Merge pull request #2654 from neddp/add_scheduled_metrics_cleanup_job
Add scheduled metrics cleanup job
2 parents 1a7246b + 54354d4 commit e325afe

7 files changed

Lines changed: 305 additions & 1 deletion

File tree

jobs/director/spec

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,12 @@ properties:
282282
description: Client certificate for mutual TLS connections to an external metrics server
283283
director.metrics_server.tls.private_key:
284284
description: Client private key for mutual TLS connections to an external metrics server
285+
director.metrics_server.file_retention_days:
286+
description: 'Number of days to retain metric binary files in /var/vcap/store/director/metrics. Older files are automatically cleaned up by a scheduled job. Set to 0 to disable cleanup.'
287+
default: 7
288+
director.metrics_server.cleanup_schedule:
289+
description: 'RufusScheduler cron formatted schedule for cleanup of stale metrics files'
290+
default: '0 0 0 * * * UTC' # once every day at midnight UTC
285291

286292
# NATs
287293
nats.address:

jobs/director/templates/director.yml.erb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,16 @@ params['scheduled_jobs'] << {
162162
'schedule' => p('director.tasks_cleanup_schedule')
163163
}
164164

165+
if p('director.metrics_server.enabled')
  # Register the metrics cleanup job only when the metrics server is enabled.
  # Keys are added in the same order as before: command, schedule, params.
  metrics_cleanup_job = { 'command' => 'ScheduledMetricsCleanup' }
  metrics_cleanup_job['schedule'] = p('director.metrics_server.cleanup_schedule')
  metrics_cleanup_job['params'] = [
    { 'retention_days' => p('director.metrics_server.file_retention_days') }
  ]
  params['scheduled_jobs'] << metrics_cleanup_job
end
174+
165175
params['record_events'] = p('director.events.record_events')
166176
if params['record_events']
167177
params['scheduled_jobs'] << {

src/bosh-director/bin/bosh-director

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ route_configuration = Bosh::Director::Api::RouteConfiguration.new(config)
2929
rack_app = Puma::Rack::Builder.app do
3030
use Rack::CommonLogger
3131
if config.metrics_server_enabled
32-
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: '/var/vcap/store/director/metrics')
32+
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: config.metrics_dir)
3333
use Bosh::Director::StripDeploymentsMiddlewareCollector
3434
use Prometheus::Middleware::Exporter
3535
end

src/bosh-director/lib/bosh/director.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ module Director
203203
require 'bosh/director/jobs/scheduled_dns_blobs_cleanup'
204204
require 'bosh/director/jobs/scheduled_dns_tombstone_cleanup'
205205
require 'bosh/director/jobs/scheduled_tasks_cleanup'
206+
require 'bosh/director/jobs/scheduled_metrics_cleanup'
206207
require 'bosh/director/jobs/create_snapshot'
207208
require 'bosh/director/jobs/snapshot_deployment'
208209
require 'bosh/director/jobs/snapshot_deployments'

src/bosh-director/lib/bosh/director/config.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,10 @@ def log_dir
265265
File.dirname(@log_file_path) if @log_file_path
266266
end
267267

268+
# Path of the "metrics" subdirectory underneath the configured base directory.
#
# @return [String] @base_dir joined with 'metrics'
def metrics_dir
  path_segments = [@base_dir, 'metrics']
  File.join(*path_segments)
end
271+
268272
def local_dns_enabled?
269273
!!@local_dns_enabled
270274
end
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
module Bosh::Director
  module Jobs
    # Scheduled job that deletes metric files older than a configured number of
    # days from the director's metrics directory (Config.metrics_dir).
    #
    # Only files matching METRIC_FILE_PATTERN are considered; anything else in
    # the directory is left untouched. A retention of 0 or less, or a missing
    # retention value, disables the cleanup.
    class ScheduledMetricsCleanup < BaseJob
      @queue = :normal

      # Glob, relative to the metrics directory, selecting the files this job may delete.
      METRIC_FILE_PATTERN = 'metric_*.bin'.freeze
      SECONDS_PER_DAY = 24 * 60 * 60

      def self.job_type
        :scheduled_metrics_cleanup
      end

      # Reports whether a run would find anything to delete.
      #
      # @param params [Array<Hash>] job params; the first entry carries 'retention_days'
      # @return [Boolean] true when at least one metric file is older than the retention period
      def self.has_work(params)
        # A missing entry or value is treated as "cleanup disabled" instead of raising on nil.
        retention_days = (params.first || {})['retention_days'] || 0
        return false if retention_days <= 0

        metrics_dir = Config.metrics_dir
        return false unless File.directory?(metrics_dir)

        stale_files_in(metrics_dir, time_days_ago(retention_days)).any?
      end

      # @param days [Numeric] number of days to go back from now
      # @return [Time] the instant +days+ days before Time.now
      def self.time_days_ago(days)
        Time.now - (days * SECONDS_PER_DAY)
      end

      def self.schedule_message
        'clean up stale metrics files'
      end

      # Lists the metric files in +metrics_dir+ last modified before +cutoff_time+.
      #
      # The directory is passed as the glob base rather than interpolated into
      # the pattern, so glob metacharacters in the path cannot alter the match.
      #
      # @param metrics_dir [String] directory to scan
      # @param cutoff_time [Time] files modified before this instant are stale
      # @return [Array<String>] full paths of the stale metric files
      def self.stale_files_in(metrics_dir, cutoff_time)
        Dir.glob(METRIC_FILE_PATTERN, base: metrics_dir)
           .map { |name| File.join(metrics_dir, name) }
           .select { |path| stale?(path, cutoff_time) }
      end

      # @return [Boolean] true when +path+ was last modified before +cutoff_time+
      def self.stale?(path, cutoff_time)
        File.mtime(path) < cutoff_time
      rescue Errno::ENOENT
        # The file vanished between listing and inspection; nothing left to clean up.
        false
      end

      # @param params [Hash] job params; 'retention_days' is the retention period in days
      def initialize(params = {}) # rubocop:disable Lint/MissingSuper
        # Default to 0 (disabled) so #perform does not raise when the value is absent.
        @retention_days = params['retention_days'] || 0
        @metrics_dir = Config.metrics_dir
      end

      # Deletes every stale metric file and reports the outcome.
      #
      # A file that cannot be deleted is logged and counted; it does not abort the run.
      #
      # @return [String] human-readable summary of the run
      def perform
        return 'Metrics cleanup disabled (retention_days is 0)' if @retention_days <= 0
        return "Metrics directory does not exist: #{@metrics_dir}" unless File.directory?(@metrics_dir)

        cutoff_time = self.class.time_days_ago(@retention_days)
        logger.info("Started cleanup of metrics files older than #{cutoff_time} from #{@metrics_dir}")

        deleted_count, failed_count = delete_files(stale_files(cutoff_time))

        output = "Deleted #{deleted_count} metrics file(s) older than #{cutoff_time}."
        output << " Failed to delete #{failed_count} file(s)." if failed_count.positive?
        logger.info(output)
        output
      end

      private

      # Stale metric files in this job's metrics directory.
      def stale_files(cutoff_time)
        self.class.stale_files_in(@metrics_dir, cutoff_time)
      end

      # Deletes each file, tolerating individual failures.
      #
      # @param files [Array<String>] paths to delete
      # @return [Array(Integer, Integer)] count of deleted files and count of failures
      def delete_files(files)
        deleted_count = 0
        failed_count = 0

        files.each do |file|
          File.delete(file)
          deleted_count += 1
          logger.debug("Deleted metrics file: #{file}")
        rescue StandardError => e
          failed_count += 1
          logger.warn("Failed to delete metrics file #{file}: #{e.message}")
        end

        [deleted_count, failed_count]
      end
    end
  end
end
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
require 'spec_helper'
2+
3+
module Bosh::Director
  describe Jobs::ScheduledMetricsCleanup do
    subject { described_class.new(*params) }

    let(:params) do
      [{
        'retention_days' => retention_days,
      }]
    end
    let(:retention_days) { 7 }
    let(:metrics_dir) { Dir.mktmpdir }
    let(:time) { Time.now }
    let(:one_day_seconds) { 24 * 60 * 60 }
    let(:seven_days_seconds) { 7 * one_day_seconds }
    let(:eight_days_ago) { time - seven_days_seconds - one_day_seconds }
    let(:six_days_ago) { time - seven_days_seconds + one_day_seconds }

    # Writes a file into the temporary metrics directory, backdates its
    # access and modification times to +mtime+, and returns its full path.
    def create_file(name, mtime, content = 'data')
      File.join(metrics_dir, name).tap do |path|
        File.write(path, content)
        File.utime(mtime, mtime, path)
      end
    end

    before do
      allow(Config).to receive(:metrics_dir).and_return(metrics_dir)
      # Freeze the clock so cutoff times in expectations match the job's own.
      allow(Time).to receive(:now).and_return(time)
    end

    after do
      FileUtils.rm_rf(metrics_dir) if File.directory?(metrics_dir)
    end

    describe '.job_type' do
      it 'returns the job type' do
        expect(described_class.job_type).to eq(:scheduled_metrics_cleanup)
      end
    end

    describe '.schedule_message' do
      it 'outputs a message' do
        expect(described_class.schedule_message).to eq('clean up stale metrics files')
      end
    end

    describe '.time_days_ago' do
      it 'calculates time correctly' do
        expect(described_class.time_days_ago(7)).to eq(time - seven_days_seconds)
      end
    end

    describe '.has_work' do
      context 'when retention_days is 0' do
        let(:retention_days) { 0 }

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when retention_days is negative' do
        let(:retention_days) { -1 }

        before { create_file('metric_old.bin', eight_days_ago) }

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when metrics directory does not exist' do
        before do
          FileUtils.rm_rf(metrics_dir)
        end

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when there are stale files' do
        before { create_file('metric_old.bin', eight_days_ago) }

        it 'returns true' do
          expect(described_class.has_work(params)).to eq(true)
        end
      end

      context 'when there are no stale files' do
        before { create_file('metric_recent.bin', six_days_ago) }

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end

      context 'when only non-metric files are stale' do
        before { create_file('other_file.txt', eight_days_ago) }

        it 'returns false' do
          expect(described_class.has_work(params)).to eq(false)
        end
      end
    end

    describe '#perform' do
      context 'when retention_days is 0' do
        let(:retention_days) { 0 }

        it 'returns disabled message' do
          expect(subject.perform).to eq('Metrics cleanup disabled (retention_days is 0)')
        end
      end

      context 'when retention_days is negative' do
        let(:retention_days) { -1 }
        let!(:old_file) { create_file('metric_old.bin', eight_days_ago) }

        it 'returns disabled message and deletes nothing' do
          expect(subject.perform).to eq('Metrics cleanup disabled (retention_days is 0)')
          expect(File.exist?(old_file)).to eq(true)
        end
      end

      context 'when metrics directory does not exist' do
        before do
          FileUtils.rm_rf(metrics_dir)
        end

        it 'returns directory not exist message' do
          expect(subject.perform).to eq("Metrics directory does not exist: #{metrics_dir}")
        end
      end

      context 'when there are files to clean up' do
        # Old files (older than retention period)
        let!(:old_file_1) { create_file('metric_old_1.bin', eight_days_ago, 'data1') }
        let!(:old_file_2) { create_file('metric_old_2.bin', eight_days_ago, 'data2') }
        # Recent file (within retention period)
        let!(:recent_file) { create_file('metric_recent.bin', six_days_ago, 'data3') }
        # Non-metric file (should not be deleted)
        let!(:other_file) { create_file('other_file.txt', eight_days_ago, 'other') }

        it 'deletes only old metric files' do
          subject.perform

          expect(File.exist?(old_file_1)).to eq(false)
          expect(File.exist?(old_file_2)).to eq(false)
          expect(File.exist?(recent_file)).to eq(true)
          expect(File.exist?(other_file)).to eq(true)
        end

        it 'returns success message with count' do
          cutoff_time = time - seven_days_seconds
          expect(subject.perform).to eq("Deleted 2 metrics file(s) older than #{cutoff_time}.")
        end

        it 'logs the cleanup operation' do
          logger = double('logger', info: nil, debug: nil, warn: nil)
          allow(subject).to receive(:logger).and_return(logger)

          subject.perform

          expect(logger).to have_received(:info).at_least(:once)
        end
      end

      context 'when file deletion fails' do
        let!(:protected_file) { create_file('metric_protected.bin', eight_days_ago) }
        # A second stale file proves the job keeps going after a failure.
        let!(:deletable_file) { create_file('metric_deletable.bin', eight_days_ago) }
        let(:logger) { double('logger', info: nil, debug: nil, warn: nil) }

        before do
          # Let every other path through to the real File.delete; only the
          # protected file fails.
          allow(File).to receive(:delete).and_call_original
          allow(File).to receive(:delete).with(protected_file).and_raise(Errno::EACCES, 'Permission denied')
        end

        it 'logs warning and continues' do
          allow(subject).to receive(:logger).and_return(logger)

          result = subject.perform

          expect(logger).to have_received(:warn).with(/Failed to delete metrics file/)
          expect(result).to match(/Failed to delete 1 file\(s\)/)
          expect(File.exist?(deletable_file)).to eq(false)
          expect(File.exist?(protected_file)).to eq(true)
        end

        it 'includes failure count in result message' do
          cutoff_time = time - seven_days_seconds
          result = subject.perform
          expect(result).to eq("Deleted 1 metrics file(s) older than #{cutoff_time}. Failed to delete 1 file(s).")
        end
      end

      context 'when there are no files to clean up' do
        it 'returns message with zero count' do
          cutoff_time = time - seven_days_seconds
          expect(subject.perform).to eq("Deleted 0 metrics file(s) older than #{cutoff_time}.")
        end
      end

      context 'with different retention periods' do
        let(:retention_days) { 30 }
        let(:thirty_one_days_ago) { time - (31 * one_day_seconds) }
        let(:twenty_nine_days_ago) { time - (29 * one_day_seconds) }
        let!(:very_old_file) { create_file('metric_very_old.bin', thirty_one_days_ago) }
        let!(:within_retention_file) { create_file('metric_within_retention.bin', twenty_nine_days_ago) }

        it 'respects the configured retention period' do
          subject.perform

          expect(File.exist?(very_old_file)).to eq(false)
          expect(File.exist?(within_retention_file)).to eq(true)
        end
      end
    end
  end
end

0 commit comments

Comments
 (0)