-
-
Notifications
You must be signed in to change notification settings - Fork 722
Expand file tree
/
Copy pathclassify.py
More file actions
185 lines (155 loc) · 5.16 KB
/
classify.py
File metadata and controls
185 lines (155 loc) · 5.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from commoncode.fileutils import file_name
from commoncode.fileutils import file_base_name
def get_relative_path(root_path, path):
"""
Return a path relativefrom the posix 'path' relative to a
base path of `len_base_path` length where the base is a directory if
`base_is_dir` True or a file otherwise.
"""
return path[len(root_path):].lstrip('/')
LEGAL_STARTS_ENDS = (
'copying',
'copyright',
'copyrights',
'copyleft',
'notice',
'license',
'licenses',
'licence',
'licences',
'licensing',
'licencing',
'legal',
'eula',
'agreement',
'copyleft',
'patent',
'patents',
)
_MANIFEST_ENDS = {
'.about': 'ABOUT file',
'/bower.json': 'bower',
'.podspec': 'cocoapod',
'/composer.json': 'composer',
'/description': 'cran',
'/elm-package.json': 'elm',
'/+compact_manifest': 'freebsd',
'+manifest': 'freebsd',
'.gemspec': 'gem',
'/metadata': 'gem',
# the extracted metadata of a gem archive
'/metadata.gz-extract': 'gem',
'/build.gradle': 'gradle',
'/project.clj': 'clojure',
'.pom': 'maven',
'/pom.xml': 'maven',
'.cabal': 'haskell',
'/haxelib.json': 'haxe',
'/package.json': 'npm',
'.nuspec': 'nuget',
'.pod': 'perl',
'/meta.yml': 'perl',
'/dist.ini': 'perl',
'/pipfile': 'pypi',
'/setup.cfg': 'pypi',
'/setup.py': 'pypi',
'/PKG-INFO': 'pypi',
'/pyproject.toml': 'pypi',
'.spec': 'rpm',
'/cargo.toml': 'rust',
'.spdx': 'spdx',
'/dependencies': 'generic',
# note that these two cannot be top-level for now
'debian/copyright': 'deb',
'meta-inf/manifest.mf': 'maven',
# TODO: Maven also has sometimes a pom under META-INF/
# 'META-INF/manifest.mf': 'JAR and OSGI',
}
MANIFEST_ENDS = tuple(_MANIFEST_ENDS)
README_STARTS_ENDS = (
'readme',
)
# Community files are usually files used for FOSS project and community
# maintainence purposes. We want to detect these as in the context of
# licenses as these files don't have interesting license detections, or
# license detection issues are not important to review for these files.
# this is similar to `key` files, which also has a lot of community info
# but there the license declarations are extremely important as they have
# information on the primary (or even secondary) licenses for the package
COMMUNITY_FILES = (
'CHANGELOG',
'ROADMAP',
'CONTRIBUTING',
'CODE_OF_CONDUCT',
'AUTHORS',
'SECURITY',
'FUNDING',
)
def clean_underscore_dash(filename):
return filename.replace('_', '').replace('-', '')
def check_is_community_file(filename):
"""
Return True if the resource is a known community filename,
return False otherwise.
"""
community_files_cleaned = [
clean_underscore_dash(filename.lower())
for filename in COMMUNITY_FILES
]
name = clean_underscore_dash(filename.lower())
if any(
name.startswith(comm_name) or name.endswith(comm_name)
for comm_name in community_files_cleaned
):
return True
return False
def check_is_resource_community_file(resource):
"""
Return True if the `resource` is a community file.
"""
return check_is_community_file(resource.name) or check_is_community_file(resource.base_name)
def check_is_path_community_file(path):
"""
Return True if the file at `path` is a community file.
"""
name = file_name(path, force_posix=True)
base_name = file_base_name(path, force_posix=True)
return check_is_community_file(name) or check_is_community_file(base_name)
def check_resource_name_start_and_end(resource, STARTS_ENDS):
"""
Return True if `resource.name` or `resource.base_name` begins or ends with
an element of `STARTS_ENDS`.
"""
name = resource.name.lower()
base_name = resource.base_name.lower()
return (
name.startswith(STARTS_ENDS)
or name.endswith(STARTS_ENDS)
or base_name.startswith(STARTS_ENDS)
or base_name.endswith(STARTS_ENDS)
)
def set_classification_flags(resource,
_LEGAL=LEGAL_STARTS_ENDS,
_MANIF=MANIFEST_ENDS,
_README=README_STARTS_ENDS,
):
"""
Set classification flags on the `resource` Resource.
"""
path = resource.path.lower()
resource.is_legal = is_legal = check_resource_name_start_and_end(resource, _LEGAL)
resource.is_readme = is_readme = check_resource_name_start_and_end(resource, _README)
resource.is_community = check_is_resource_community_file(resource)
# FIXME: this will never be picked up as this is NOT available in a pre-scan plugin
has_package_data = bool(getattr(resource, 'package_data', False))
resource.is_manifest = is_manifest = path.endswith(_MANIF) or has_package_data
resource.is_key_file = (resource.is_top_level and (is_readme or is_legal or is_manifest))
return resource