Skip to content

Commit f43056f

Browse files
committed
refactor: lazy init dit classifier, prefer headless body for classification
1 parent d1c5b7c commit f43056f

1 file changed

Lines changed: 14 additions & 8 deletions

File tree

runner/runner.go

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -430,11 +430,13 @@ func New(options *Options) (*Runner, error) {
430430
}
431431

432432
runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build()
433-
ditClassifier, err := dit.New()
434-
if err != nil {
435-
gologger.Warning().Msgf("Could not initialize page classifier: %s", err)
433+
if options.JSONOutput || options.CSVOutput || len(options.OutputFilterPageType) > 0 {
434+
ditClassifier, err := dit.New()
435+
if err != nil {
436+
gologger.Warning().Msgf("Could not initialize page classifier: %s", err)
437+
}
438+
runner.ditClassifier = ditClassifier
436439
}
437-
runner.ditClassifier = ditClassifier
438440

439441
if options.SecretFile != "" {
440442
authProviderOpts := &authprovider.AuthProviderOptions{
@@ -652,12 +654,16 @@ func (r *Runner) duplicate(result *Result) bool {
652654
return false
653655
}
654656

655-
func (r *Runner) classifyPage(body string, pHash uint64) map[string]interface{} {
656-
kb := map[string]any{"pHash": pHash}
657+
func (r *Runner) classifyPage(headlessBody, body string, pHash uint64) map[string]interface{} {
658+
kb := map[string]interface{}{"pHash": pHash}
657659
if r.ditClassifier == nil {
658660
return kb
659661
}
660-
result, err := r.ditClassifier.ExtractPageType(body)
662+
html := body
663+
if headlessBody != "" {
664+
html = headlessBody
665+
}
666+
result, err := r.ditClassifier.ExtractPageType(html)
661667
if err != nil {
662668
return kb
663669
}
@@ -2636,7 +2642,7 @@ retry:
26362642
ExtractRegex: extractRegex,
26372643
ScreenshotBytes: screenshotBytes,
26382644
HeadlessBody: headlessBody,
2639-
KnowledgeBase: r.classifyPage(respData, pHash),
2645+
KnowledgeBase: r.classifyPage(headlessBody, respData, pHash),
26402646
TechnologyDetails: technologyDetails,
26412647
Resolvers: resolvers,
26422648
RequestRaw: requestDump,

0 commit comments

Comments
 (0)