docs/src/events/lib/analyze-comment.ts at c86c157fc9998549fd084489464b0bc2aa0a6751 · github/docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import fs from 'fs'
import yaml from 'js-yaml'
import { cuss } from 'cuss'
import { cuss as cussPt } from 'cuss/pt'
import { cuss as cussFr } from 'cuss/fr'
import { cuss as cussEs } from 'cuss/es'
// Module-level cache for the language guesser; stays null until first use.
let language: any = null

/**
 * Return the shared language-guesser instance, creating it on first call.
 *
 * The `@horizon-rs/language-guesser` module is loaded with a dynamic
 * `import()` only when no instance has been cached yet.
 */
async function getLanguageInstance() {
  if (language) {
    return language
  }
  const guesserModule = await import('@horizon-rs/language-guesser')
  language = new guesserModule.Language()
  return language
}

/**
 * Ordered table of quality signals evaluated by `analyzeComment`.
 *
 * Each entry has:
 * - `reduction`: the amount subtracted from the rating (which starts at 1.0)
 *   when the validator returns a truthy value. A reduction of 1.0 is enough
 *   on its own to bring a fresh rating down to 0.
 * - `name`: the label recorded in the returned `signals` list.
 * - `validator`: called with the comment text and the comment language.
 *   Most validators are synchronous; `not-language` is async, so callers
 *   must `await` the result.
 *
 * `analyzeComment` walks this array in order and stops as soon as the rating
 * reaches 0 or below, so the order of entries decides which signals can be
 * reported for a given comment.
 */
// Exported for the debugging CLI script
export const SIGNAL_RATINGS = [
  {
    reduction: 1.0,
    name: 'email-only',
    validator: (comment: string) => isEmailOnly(comment),
  },
  {
    reduction: 0.2,
    name: 'contains-email',
    validator: (comment: string) => isContainingEmail(comment),
  },
  {
    reduction: 1.0,
    name: 'url-only',
    validator: (comment: string) => isURL(comment),
  },
  {
    reduction: 1.0,
    name: 'numbers-only',
    validator: (comment: string) => isNumbersOnly(comment),
  },
  {
    reduction: 0.1,
    name: 'all-uppercase',
    validator: (comment: string) => isAllUppercase(comment),
  },
  {
    reduction: 0.5,
    name: 'single-word',
    validator: (comment: string) => isSingleWord(comment),
  },
  {
    reduction: 0.2,
    name: 'too-short',
    validator: (comment: string) => isTooShort(comment),
  },
  {
    reduction: 0.2,
    name: 'not-language',
    validator: async (comment: string, commentLanguage: string) =>
      await isNotLanguage(comment, commentLanguage),
  },
  {
    reduction: 0.3,
    name: 'cuss-words-likely',
    validator: (comment: string, commentLanguage: string) =>
      isLikelyCussWords(comment, commentLanguage),
  },
  {
    reduction: 0.1,
    name: 'cuss-words-maybe',
    validator: (comment: string, commentLanguage: string) =>
      isMaybeCussWords(comment, commentLanguage),
  },
  {
    reduction: 0.2,
    name: 'mostly-emoji',
    validator: (comment: string) => isMostlyEmoji(comment),
  },
  {
    reduction: 1.0,
    name: 'spammy-words',
    validator: (comment: string) => isSpammyWordList(comment),
  },
]

/**
 * Guess which language a comment is written in.
 *
 * @param comment - Raw comment text.
 * @returns The `alpha2` value of the guesser's best match, or `undefined`
 *   when the comment is empty or whitespace-only, when the guesser has no
 *   best match, or when the match has no `alpha2` value.
 */
export async function getGuessedLanguage(comment: string) {
  const trimmed = comment ? comment.trim() : ''
  if (!trimmed) {
    return
  }

  const guesser = await getLanguageInstance()
  const guess = guesser.guessBest(trimmed, [])
  // Can happen if the text is just whitespace
  if (!guess) {
    return
  }

  // @horizon-rs/language-guesser is based on tri-grams and can lead
  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian! And that 'I wanne robux 1000' is Polish!
  // Short texts simply don't carry enough clues, so treat the result as
  // a hint rather than a fact.
  return guess.alpha2 || undefined
}

/**
 * Score a comment against every entry in `SIGNAL_RATINGS`.
 *
 * The rating starts at 1.0. Signals are evaluated in table order; each
 * signal whose validator returns a truthy value is recorded by name and its
 * `reduction` is subtracted from the rating. Evaluation stops once the
 * rating is 0 or lower, so later signals are not checked. The rating is not
 * clamped and can end up below 0.
 *
 * @param text - The comment text to analyze.
 * @param commentLanguage - Language code the comment is expected to be in
 *   (defaults to `'en'`); forwarded to every validator.
 * @returns The names of the triggered signals and the final rating.
 */
export async function analyzeComment(text: string, commentLanguage = 'en') {
  let rating = 1.0
  const signals: string[] = []

  for (const signal of SIGNAL_RATINGS) {
    // Validators may be sync or async; `await` handles both.
    const triggered = await signal.validator(text, commentLanguage)
    if (triggered) {
      signals.push(signal.name)
      rating -= signal.reduction
    }
    if (rating <= 0) {
      break
    }
  }

  return { signals, rating }
}

/**
 * Tell whether the whole comment looks like a single email address.
 *
 * The check is deliberately loose: the text must contain exactly one `@`,
 * must not contain `://` (which would make it look like a URL with
 * credentials), and must have no whitespace once leading and trailing
 * whitespace is trimmed.
 *
 * @param text - The comment (or a single whitespace-delimited token).
 * @returns `true` when the text matches the shape above, otherwise `false`.
 */
function isEmailOnly(text: string): boolean {
  if (!text.includes('@') || text.includes('://')) {
    return false
  }
  // Inner whitespace means there is more than one token.
  if (/\s/.test(text.trim())) {
    return false
  }
  // Splitting on '@' yields exactly two parts when there is exactly one '@'.
  return text.split('@').length === 2
}

/**
 * Tell whether the comment contains an email address among other text.
 *
 * A comment that is nothing but an email address is excluded here; that
 * case is what `isEmailOnly` reports.
 *
 * @param text - The comment text.
 * @returns `true` when at least one whitespace-delimited token passes
 *   `isEmailOnly` and the comment as a whole does not.
 */
function isContainingEmail(text: string) {
  if (!text.includes('@') || isEmailOnly(text)) {
    return false
  }
  // Don't use splitWords() here because `foo@example.com` will be
  // split up into ['foo', 'example.com'].
  for (const token of text.split(/\s+/g)) {
    if (isEmailOnly(token)) {
      return true
    }
  }
  return false
}

/**
 * Tell whether the whole comment is a single parseable URL.
 *
 * Any whitespace inside the trimmed text disqualifies it. This matters for
 * tabs and newlines in particular: the URL parser strips them before
 * parsing, so a multi-line comment such as "https://example.com\nthanks"
 * would otherwise be accepted as one URL.
 *
 * @param text - The comment text.
 * @returns `true` when the trimmed text has no whitespace and
 *   `URL.canParse` accepts it, otherwise `false`.
 */
function isURL(text: string): boolean {
  const candidate = text.trim()
  if (/\s/.test(candidate)) {
    return false
  }
  return URL.canParse(candidate)
}

/**
 * Tell whether the comment consists solely of digits once all whitespace
 * has been removed. An empty or whitespace-only comment is not numbers-only.
 */
function isNumbersOnly(text: string) {
  const withoutWhitespace = text.replace(/\s/g, '')
  const digitsOnly = /^\d+$/
  return digitsOnly.test(withoutWhitespace)
}

/**
 * Tell whether the comment is written entirely in upper case.
 *
 * At least one ASCII capital letter must be present, so that text with no
 * letters at all (digits, punctuation) is not reported.
 */
function isAllUppercase(text: string) {
  const hasAsciiCapital = /[A-Z]/.test(text)
  if (!hasAsciiCapital) {
    return false
  }
  // Upper-casing changes nothing only when nothing is lower case.
  return text.toUpperCase() === text
}

/**
 * Tell whether the comment has too few words to be useful.
 *
 * Words are the whitespace-delimited tokens of the trimmed text. An empty
 * comment splits into a single empty token and therefore counts as short.
 *
 * @param text - The comment text.
 * @returns `true` when the comment has three or fewer tokens, otherwise
 *   `false`.
 */
function isTooShort(text: string): boolean {
  const tokens = text.trim().split(/\s+/)
  return tokens.length <= 3
}

/**
 * Tell whether the trimmed comment is a single whitespace-free token,
 * e.g. `this-has-no-whitespace` or `snap/hooks/install`.
 * An empty comment also counts, because it splits into one empty token.
 */
function isSingleWord(text: string) {
  const tokenCount = text.trim().split(/\s+/).length
  return tokenCount < 2
}

/**
 * Tell whether the comment appears to be written in a language other than
 * the expected one.
 *
 * @param text - The comment text.
 * @param language_ - The language code the comment is expected to be in.
 * @returns `true` when the guesser has no best match at all, or when its
 *   best match is neither `language_` nor English.
 */
async function isNotLanguage(text: string, language_: string) {
  const guesser = await getLanguageInstance()
  const guess = guesser.guessBest(text.trim(), [])
  // Can happen if the text is just whitespace
  if (!guess) {
    return true
  }

  // @horizon-rs/language-guesser is based on tri-grams and can lead
  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
  // Haitian! And that 'I wanne robux 1000' is Polish!
  // Short texts don't give it enough clues. Since this is only a signal and
  // not a hard blocker, the result is a clue rather than a fact.

  // English comments are always considered valid, whatever language was
  // expected, so they never reduce the score.
  const matchesExpected = guess.alpha2 === language_
  const isEnglish = guess.alpha2 === 'en'
  return !(matchesExpected || isEnglish)
}

/**
 * Tell whether emoji make up more than a quarter of the comment.
 *
 * Whitespace is ignored. The Unicode `Emoji` property also covers the ASCII
 * digits, `#` and `*` (they are the bases of keycap emoji), so a plain
 * `\p{Emoji}` match would count ordinary numbers as emoji. Those characters
 * are therefore only counted when they are part of an actual keycap
 * sequence (base, optional U+FE0F, then U+20E3).
 *
 * The ratio is the number of emoji matches divided by the text length in
 * UTF-16 code units, so an emoji outside the Basic Multilingual Plane
 * contributes one match but two units of length.
 *
 * @param text - The comment text.
 * @returns `true` when the ratio is above 0.25, otherwise `false`.
 */
function isMostlyEmoji(text: string): boolean {
  const compact = text.replace(/\s/g, '')
  const emojiRegex = /[0-9#*]\uFE0F?\u20E3|(?![0-9#*])\p{Emoji}/gu
  const emojiMatches = compact.match(emojiRegex)
  if (!emojiMatches) return false
  const emojiRatio = emojiMatches.length / compact.length
  return emojiRatio > 0.25
}

/**
 * Pick the cuss-word table for a language code.
 *
 * Portuguese (`pt`), French (`fr`) and Spanish (`es`) have dedicated
 * tables; every other code falls back to the default `cuss` table.
 */
function getCussWords(lang: string) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}

/**
 * Tell whether any word of the comment is listed in the cuss-word table
 * for the given language with exactly the given rating.
 *
 * @param text - The comment text; it is split with `splitWords` and each
 *   word is lower-cased before the lookup.
 * @param language_ - Language code used to pick the table.
 * @param rating - The table rating to match exactly (defaults to 2, the
 *   value used for the `cuss-words-likely` signal).
 * @returns `true` on the first matching word, otherwise `false`.
 */
function isLikelyCussWords(text: string, language_: string, rating = 2) {
  const ratingsByWord = getCussWords(language_)
  return splitWords(text).some((rawWord) => {
    const wordRating = ratingsByWord[rawWord.toLowerCase()]
    // Missing entries (and a rating of 0) are never a match.
    return Boolean(wordRating) && wordRating === rating
  })
}

/**
 * Same check as `isLikelyCussWords`, but matching words whose table rating
 * is 1 instead of the default 2. Backs the `cuss-words-maybe` signal.
 */
function isMaybeCussWords(text: string, language_: string) {
  const maybeRating = 1
  return isLikelyCussWords(text, language_, maybeRating)
}

// One word-granularity segmenter shared by every splitWords() call.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Split text into its word-like segments, in order.
 *
 * Segments that the segmenter does not flag as word-like (whitespace,
 * punctuation) are dropped.
 */
function splitWords(text: string) {
  const words: string[] = []
  for (const part of segmenter.segment(text)) {
    if (part.isWordLike) {
      words.push(part.segment)
    }
  }
  return words
}

// The survey word list is read synchronously once, when this module is
// first loaded. The relative path is resolved against the process's
// current working directory.
const surveyFileContents = fs.readFileSync('data/survey-words.yml', 'utf8')
const surveyYaml = yaml.load(surveyFileContents) as { words: string[] }

// Lower-cased so lookups against lower-cased comment text are
// case-insensitive.
const surveyWords: string[] = []
for (const entry of surveyYaml.words) {
  surveyWords.push(entry.toLowerCase())
}

/**
 * Tell whether any token of the comment is on the survey word list.
 *
 * The lower-cased text is split on runs of whitespace and on a literal
 * backslash followed by one or more `n` characters. Because the separator
 * pattern is a capturing group, the separators themselves also end up in
 * the token list.
 */
function isSpammyWordList(text: string) {
  const tokens = text.toLowerCase().split(/(\s+|\\n+)/g)
  // Currently, we're intentionally not checking for
  // survey words that are substrings of a comment word.
  for (const token of tokens) {
    if (surveyWords.includes(token)) {
      return true
    }
  }
  return false
}