Skip to content

Commit 93e0c1c

Browse files
committed
Created script to extract detected lic matched text
Signed-off-by: Chin Yeung Li <tli@nexb.com>
1 parent 9808981 commit 93e0c1c

4 files changed

Lines changed: 131 additions & 0 deletions

File tree

docs/source/utilitycode/utilities.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Other Utilities
1717
CPP Includes <utilitycode/cpp_includes>
1818
Debian Copyright Parser <utilitycode/debian-copyright-parser>
1919
Debian Install Path <utilitycode/debian-package-list>
20+
Extract Matched Text for Detected Licenses <utilitycode/extract_lic_matched_text>
2021
Extract Source Path from map <utilitycode/extract_source_path_from_maps>
2122
Inventory to JSON <utilitycode/i2j>
2223
JSON to XLSX <utilitycode/json2xlsx>
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
.. _extract_lic_matched_text:
2+
3+
==========================================
4+
Extract Matched Text for Detected Licenses
5+
==========================================
6+
7+
|div-page-outline|
8+
9+
.. contents:: :local:
10+
:depth: 7
11+
12+
13+
14+
Usage
15+
=====
16+
17+
.. code-block::
18+
19+
Usage: extract_detected_license_text [OPTIONS] INPUT OUTPUT
20+
21+
Take an SCTK JSON input that has '"--license-text": true' and pull out each
22+
detected license’s matched_text. Save each one into its own file, using the
23+
rule_identifier as the filename, in a directory.
24+
25+
Options:
26+
-h, --help Show this message and exit.
27+
28+
Example
29+
=======
30+
31+
.. code-block::
32+
33+
extract_detected_license_text ~/project/scans/scan_results.json ~/project/license_texts/
34+

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ console_scripts =
7373
cpp_includes = utilitycode.cpp_includes:cli
7474
debut-copyright-parser = utilitycode.debian_copyright_parser:cli
7575
debian-package-list = utilitycode.debian_package_installed_list:cli
76+
extract_detected_license_text = utilitycode.extract_lic_matched_text:cli
7677
extract_docker_extra_data = utilitycode.extract_docker_extra_data:cli
7778
extract_map = utilitycode.extract_source_path_from_maps:cli
7879
file-cat = utilitycode.file_cat:cli
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf8 -*-
3+
4+
# ============================================================================
5+
# Copyright (c) nexB Inc. http://www.nexb.com/ - All rights reserved.
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
# SPDX-License-Identifier: Apache-2.0
17+
# ============================================================================
18+
19+
import click
20+
import json
21+
import os
22+
import sys
23+
24+
25+
@click.command()
@click.argument(
    "input",
    required=True,
    metavar="INPUT",
    type=click.Path(
        exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True
    ),
)
@click.argument(
    "output",
    required=True,
    metavar="OUTPUT",
    type=click.Path(file_okay=False, dir_okay=True, writable=True, resolve_path=True),
)
@click.help_option("-h", "--help")
def cli(input, output):
    """
    Take an SCTK JSON input that has '"--license-text": true' and pull out
    each detected license’s matched_text. Save each one into its own file,
    using the rule_identifier as the filename, in a directory.
    """
    # NOTE: the parameter names are bound to the click argument names declared
    # above, which is why ``input`` shadows the builtin here.
    #
    # Every validation failure below prints a message and exits with status 1
    # instead of surfacing a raw traceback to the command line user.
    if not input.endswith(".json"):
        print("The input has to be a SCTK produced .json file.")
        sys.exit(1)

    try:
        with open(input, "r", encoding="utf-8") as f:
            data = json.load(f)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses: the file has a .json name but is not readable JSON.
        print("The input has to be a SCTK produced .json file.")
        sys.exit(1)

    # A JSON document without the expected "headers" layout (missing key,
    # empty list, or a non-mapping top level) is not a SCTK scan either.
    try:
        tool_name = data["headers"][0]["tool_name"]
    except (KeyError, IndexError, TypeError):
        tool_name = None
    if tool_name != "scancode-toolkit":
        print("The input has to be a SCTK produced .json file.")
        sys.exit(1)

    license_detections_data_list = data.get("license_detections")
    if license_detections_data_list is None:
        print("The input does not contain any license_detections data.")
        sys.exit(1)

    extracted_data_dict = extract_matched_text(license_detections_data_list)
    save_to_files(extracted_data_dict, output)

    print(
        f"Extracted {len(extracted_data_dict)} matched_texts from {input} and saved them to {output}."
    )
67+
68+
69+
def extract_matched_text(license_detections_data_list):
    """
    Extract the matched_text for each detected license from the
    license_detections data list. Use the rule_identifier as the key and
    the matched_text as the value in a dictionary.

    A detection without "reference_matches", and a match without a
    "rule_identifier" or without a "matched_text" value, is skipped instead
    of raising a KeyError. When the same rule_identifier occurs more than
    once, the matched_text of the last occurrence is kept.
    """
    extracted_data = {}
    for license_detection in license_detections_data_list:
        # "reference_matches" may be absent or null; treat both as "no matches".
        for reference_match in license_detection.get("reference_matches") or []:
            rule_identifier = reference_match.get("rule_identifier")
            matched_text = reference_match.get("matched_text")
            # Nothing can be saved without both a name and a text.
            if rule_identifier is None or matched_text is None:
                continue
            extracted_data[rule_identifier] = matched_text
    return extracted_data
83+
84+
85+
def save_to_files(data_dict, output_dir):
    """
    Save the extracted matched_text data to individual text files in the
    output directory.

    Each key of ``data_dict`` becomes a file named ``<key>.txt`` inside
    ``output_dir``; the corresponding value, converted with ``str()``, is
    written as its UTF-8 encoded content. ``output_dir`` is created,
    including any missing parent directories, when it does not exist yet.
    An existing file with the same name is overwritten.
    """
    # The directory is not guaranteed to exist beforehand; create it up front
    # so the first open() below does not fail with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    for matched_rule, matched_text in data_dict.items():
        filepath = os.path.join(output_dir, f"{matched_rule}.txt")
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(str(matched_text))

0 commit comments

Comments
 (0)