scancode-toolkit/src/textcode/gibberish.py at 2ebca1fab6b8a2d52ef50ad807e658a9d88adb9f · aboutcode-org/scancode-toolkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/python
#
# From: https://raw.githubusercontent.com/yapus/gibberish/01637fe1fda827529ca76b8d6fee2de9100719f1/gibberish/gibberish.py
#
# 12Jun2017 Petr Janata - added srcfile and outfile
# 17Jun2017 Petr Janata - expanded set of accepted characters to include digits and hyphen
#
# which is based off of:
# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
#

import math
import pickle
from pathlib import Path

# Directory, next to this module, that holds the training files and the
# pickled model.
data_dir = Path(__file__).parent / 'data' / 'gibberish'
# Pickle file the model is persisted to and loaded from.
model_path = data_dir / 'gib_model.pki'
# Default training inputs: a large corpus used to count character transitions,
# plus sample "good" and "bad" phrases used to pick the detection threshold.
big_file_path = data_dir / 'big.txt'
good_file_path = data_dir / 'good.txt'
bad_file_path = data_dir / 'bad.txt'

# The only characters the model looks at; every other character is dropped
# during normalization.
accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- '
# Map each accepted character to its row/column index in the transition matrix.
pos = {char: idx for idx, char in enumerate(accepted_chars)}


class Gibberish(object):
    """
    Character-bigram gibberish detector.

    The model is a matrix of log transition probabilities between the
    characters of ``accepted_chars`` (``self.mat``) plus a cutoff
    (``self.thresh``). Text whose average transition probability falls below
    the cutoff is reported as gibberish.
    """

    # Text containing any of these substrings (compared lower-cased) is never
    # reported as gibberish by ``detect_gibberish``.
    COPYRIGHT_INDICATORS = (
        'copyright', '(c)', 'c)', '©', '@copyright',
        'author:', 'commit', 'portions:', 'rights reserved',
        '(p)', 'trademark', 'intellectual property'
    )

    def __init__(self):
        """Load the persisted model if it exists, otherwise train a new one."""
        if model_path.exists():
            self.load_persisted_model()
        else:
            self.train()

    def persist_model(self):
        """Pickle this instance's attributes to ``model_path``."""
        with open(model_path, mode='wb') as f:
            pickle.dump(vars(self), f)

    def load_persisted_model(self):
        """Restore the instance attributes pickled at ``model_path``."""
        # NOTE: unpickling can execute arbitrary code; ``model_path`` must only
        # ever point at a trusted model file.
        with open(model_path, mode='rb') as f:
            persisted_model = pickle.load(f)
        for key, value in persisted_model.items():
            setattr(self, key, value)

    def normalize(self, line):
        """
        Return the list of lower-cased characters of ``line`` that are in
        ``accepted_chars``.

        This helps keep the model relatively small by ignoring punctuation,
        infrequent symbols, etc.
        """
        # Lower-case one character at a time (not the whole string) so that a
        # character whose lower-case form is several characters long is dropped.
        return [c for c in map(str.lower, line) if c in accepted_chars]

    def ngram(self, n, l):
        """Yield every ``n``-character window of ``l`` after normalizing it."""
        filtered = self.normalize(l)
        for start in range(len(filtered) - n + 1):
            yield ''.join(filtered[start:start + n])

    def avg_transition_prob(self, l, log_prob_mat):
        """
        Return the average transition probability of ``l`` through
        ``log_prob_mat``.

        The log probabilities of all character pairs are averaged and then
        exponentiated. Text with no character pair at all yields 1.0.
        """
        log_prob = 0.0
        transition_ct = 0
        for a, b in self.ngram(2, l):
            log_prob += log_prob_mat[pos[a]][pos[b]]
            transition_ct += 1
        # The exponentiation translates from log probs to probs; ``or 1``
        # avoids dividing by zero when there was no transition.
        return math.exp(log_prob / (transition_ct or 1))

    def train(self, bigfile=big_file_path, goodfile=good_file_path,
              badfile=bad_file_path):
        """
        Build the model from the training files and persist it as a pickle.

        ``bigfile`` is the corpus used to count character transitions.
        ``goodfile`` and ``badfile`` hold one sample phrase per line and are
        used to choose the threshold. Sets ``self.mat`` and ``self.thresh``.

        Raise an AssertionError if the trained model does not score every
        good phrase above every bad phrase.
        """
        k = len(accepted_chars)
        # Assume we have seen 10 of each character pair.  This acts as a kind of
        # prior or smoothing factor.  This way, if we see a character transition
        # live that we've never observed in the past, we won't assume the entire
        # string has 0 probability.
        counts = [[10] * k for _ in range(k)]

        # Count transitions from big text file, taken
        # from http://norvig.com/spell-correct.html
        with open(bigfile, encoding='utf-8') as corpus:
            for line in corpus:
                for a, b in self.ngram(2, line):
                    counts[pos[a]][pos[b]] += 1

        # Normalize the counts so that they become log probabilities.
        # We use log probabilities rather than straight probabilities to avoid
        # numeric underflow issues with long texts.
        # This contains a justification:
        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
        for row in counts:
            total = float(sum(row))
            row[:] = [math.log(count / total) for count in row]

        # Find the probability of generating a few arbitrarily chosen good and
        # bad phrases.
        with open(goodfile, encoding='utf-8') as good:
            good_probs = [self.avg_transition_prob(line, counts) for line in good]
        with open(badfile, encoding='utf-8') as bad:
            bad_probs = [self.avg_transition_prob(line, counts) for line in bad]

        worst_good = min(good_probs)
        best_bad = max(bad_probs)

        # Check that we actually are capable of detecting the junk. This is an
        # explicit raise rather than an ``assert`` statement so that the check
        # still runs when Python is started with -O.
        if not worst_good > best_bad:
            raise AssertionError(
                'gibberish model cannot separate good phrases from bad phrases'
            )

        # And pick a threshold halfway between the worst good and best bad inputs.
        self.mat = counts
        self.thresh = (worst_good + best_bad) / 2
        self.persist_model()

    def detect_gibberish(self, text):
        """
        Return True if ``text`` scores below the model threshold.

        Text containing any of ``COPYRIGHT_INDICATORS`` (case-insensitive) is
        never reported as gibberish.
        """
        text_lower = text.lower()
        if any(indicator in text_lower for indicator in self.COPYRIGHT_INDICATORS):
            return False

        # ``avg_transition_prob`` normalizes the text itself.
        return self.avg_transition_prob(text, self.mat) < self.thresh

    def percent_gibberish(self, text):
        """
        Return the fraction (0.0 to 1.0) of the words of ``text`` that are
        detected as gibberish, or 0.0 if ``text`` has no word.

        Words are the space-separated chunks left after normalization.
        """
        # split() without a separator drops the empty strings that runs of
        # spaces would otherwise produce, so they are not counted as words.
        words = ''.join(self.normalize(text)).split()
        if not words:
            return 0.0

        gibberish_count = sum(1 for word in words if self.detect_gibberish(word))
        return gibberish_count / len(words)

    def gibberish_pct(self, text):
        """
        Return the average transition probability of ``text`` under the model.

        This is the raw score that ``detect_gibberish`` compares against the
        threshold: lower values are more gibberish-like.
        """
        # ``avg_transition_prob`` normalizes the text itself.
        return self.avg_transition_prob(text, self.mat)