# etc/scripts/dataset_pipeline/annotate_composites.py
#
# Annotates composite (AND/OR) license rules with {{ }} required phrase markers.
# Uses scancode's license index to find license names in the rule text.
import re
from pathlib import Path

import click
from license_expression import Licensing

from licensedcode.models import Rule, load_licenses
from licensedcode.models import rules_data_dir as default_rules_data_dir
from licensedcode.required_phrases import add_required_phrase_to_rule

# Matches an existing {{ ... }} required phrase marker in a rule text.
MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
# Matches a trailing version such as " 2.0", " v3", " 2.1 or later" or " 3.0+".
VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)

# A version-stripped base name shorter than this is not used as a candidate.
MIN_BASE_NAME_LENGTH = 10
# Candidate names shorter than this are never searched for in a rule text.
MIN_CANDIDATE_LENGTH = 3

# Extra short forms used in rule text that the license index does not have.
EXTRA_NAMES = {
    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
    'mit': ['MIT License', 'MIT license', 'MIT'],
    'isc': ['ISC License', 'ISC license', 'ISC'],
    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
    'unlicense': ['Unlicense'],
}


def strip_version_suffix(name):
    """
    Return ``name`` with its trailing version removed.

    Return None when there is no trailing version to remove, or when the
    remaining base name is shorter than MIN_BASE_NAME_LENGTH characters.
    """
    result = VERSION_SUFFIX_RE.sub('', name).strip()
    if len(result) < MIN_BASE_NAME_LENGTH or result == name:
        return None
    return result


def get_candidate_names(lic):
    """
    Return the list of names to search for in a rule text, longest first.

    Names are collected from the ``name`` (with and without its version
    suffix), ``short_name``, ``spdx_license_key`` and ``key`` attributes of the
    ``lic`` license object and from EXTRA_NAMES, without duplicates.
    """
    names = []
    if lic.name:
        names.append(lic.name)
        base = strip_version_suffix(lic.name)
        if base:
            names.append(base)
    if lic.short_name and lic.short_name not in names:
        names.append(lic.short_name)
    if lic.spdx_license_key and lic.spdx_license_key not in names:
        names.append(lic.spdx_license_key)
    if lic.key not in names:
        names.append(lic.key)
    for extra in EXTRA_NAMES.get(lic.key, []):
        if extra not in names:
            names.append(extra)
    # Longest first so the most specific name is tried before its short forms.
    names.sort(key=len, reverse=True)
    return names


def find_in_text(text, candidates):
    """
    Return the first of ``candidates`` found in ``text``, or None.

    The search is case-insensitive and the returned string is the matched span
    of ``text`` with its original case. Candidates are tried in the order
    given; empty ones and ones shorter than MIN_CANDIDATE_LENGTH are skipped.

    A candidate only matches when it is not glued to a word character on
    either side. A plain substring search would otherwise find "mit" inside
    "limited" or "gpl-2.0" inside "lgpl-2.0" and report a license name that
    the text does not contain.
    """
    for name in candidates:
        if not name or len(name) < MIN_CANDIDATE_LENGTH:
            continue
        # Lookarounds rather than \b: a name may start or end with a non-word
        # character (for instance "GPL-2.0+"), where \b would never match.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # Return the matched span itself: slicing ``text`` with an offset
            # computed on a lowercased copy can drift when lowercasing changes
            # the length of the string.
            return match.group(0)
    return None


def _collect_phrases(text, keys, licenses_db):
    """
    Return one matched phrase per license key of ``keys``, in order.

    Return None as soon as a key is missing from ``licenses_db`` or none of
    its candidate names is found in ``text``.
    """
    phrases = []
    for key in keys:
        lic = licenses_db.get(key)
        if not lic:
            return None
        match = find_in_text(text, get_candidate_names(lic))
        if not match:
            return None
        phrases.append(match)
    return phrases


# Command line entry point.
#
# - rules_dir: directory holding the .RULE files. Defaults to the
#   src/licensedcode/data/rules directory of this checkout when it exists, and
#   to the licensedcode rules data directory otherwise.
# - expression_filter: only rules whose license expression contains this
#   substring are processed.
# - limit: stop once this many rules have been processed. None or 0 means no
#   limit.
# - dry_run: passed through to add_required_phrase_to_rule; annotated rules and
#   their phrases are also printed.
#
# The docstring of main() is the --help text of the command, hence this
# comment block.
@click.command()
@click.option('--rules-dir', type=click.Path(exists=True), default=None)
@click.option('--expression-filter', default=None,
              help='only process rules containing this in their expression')
@click.option('--limit', type=int, default=None)
@click.option('--dry-run', is_flag=True)
def main(rules_dir, expression_filter, limit, dry_run):
    """annotate rules with required phrase markers"""
    if not rules_dir:
        # This script lives three directory levels below the repository root.
        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir

    rules_path = Path(rules_dir)
    licenses_db = load_licenses()
    licensing = Licensing(list(licenses_db.values()))

    processed = 0
    annotated = 0

    for rf in sorted(rules_path.glob('*.RULE')):
        if limit and processed >= limit:
            break
        # Only composite rules are of interest, as told by their file name.
        stem = rf.stem
        if '_or_' not in stem and '_and_' not in stem:
            continue
        try:
            rule = Rule.from_file(rule_file=str(rf))
        except Exception as e:
            # Best effort: a rule that cannot be loaded is reported and
            # skipped rather than aborting the whole run.
            click.echo(f'  skipping {rf.name}: cannot load rule: {e}', err=True)
            continue
        if not rule.license_expression:
            continue
        if getattr(rule, 'is_required_phrase', False):
            continue
        text = rule.text or ''
        # Leave alone the rules that already carry {{ }} markers.
        if MARKER_RE.search(text):
            continue
        if expression_filter and expression_filter not in rule.license_expression:
            continue

        processed += 1
        try:
            keys = licensing.license_keys(rule.license_expression, unique=True)
        except Exception as e:
            # Best effort: same as above for an expression that cannot be parsed.
            click.echo(f'  skipping {rf.name}: cannot parse expression: {e}', err=True)
            continue
        if not keys:
            continue

        # Find each license name in the text. A rule is annotated only when
        # every license of its expression is named in its text.
        phrases = _collect_phrases(text, keys, licenses_db)
        if phrases is None:
            continue

        # Every phrase is submitted: no short-circuit on the first success.
        added = False
        for phrase in phrases:
            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
                                           source='composite_annotation', dry_run=dry_run):
                added = True
        if added:
            annotated += 1
            if dry_run:
                click.echo(f'  {rule.identifier}: {phrases}')

    click.echo(f'\ndone - {annotated}/{processed} annotated')


if __name__ == '__main__':
    main()

# commands:
#   python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
#   python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
#   python etc/scripts/dataset_pipeline/annotate_composites.py