Skip to content

Commit b90dd22

Browse files
authored
Merge pull request #885 from juanjemdIos/master
parser dockerfile. Docs. Fixes #725
2 parents 6183e73 + d0a81f6 commit b90dd22

7 files changed

Lines changed: 460 additions & 134 deletions

File tree

docs/dockerfiledoc.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
The following metadata fields can be extracted from a Dockerfile.
2+
These fields are defined using Dockerfile `LABEL` instructions as described in the
3+
[Dockerfile reference](https://docs.docker.com/reference/dockerfile/) and are interpreted
4+
according to the OCI Image Specification, following the
5+
[mapping for OCI image annotations](https://github.com/opencontainers/image-spec/blob/main/annotations.md#pre-defined-annotation-keys).
6+
7+
| Software metadata category | SOMEF metadata JSON path | DOCKERFILE metadata file field |
8+
|-----------------------------|-----------------------------------------|------------------------------------|
9+
| authors | authors[i].result.value | org.opencontainers.image.authors *(1)* |
10+
| authors | authors[i].result.value | LABEL maintainer *(1)* |
11+
| code_repository | code_repository[i].result.value | org.opencontainers.image.url |
12+
| description | description[i].result.value | org.opencontainers.image.description |
13+
| documentation | documentation[i].result.value | org.opencontainers.image.documentation |
14+
| license | license[i].result.value | org.opencontainers.image.licenses |
15+
| name | name[i].result.value | org.opencontainers.image.ref.name |
16+
| owner | owner[i].result.value | org.opencontainers.image.vendor |
17+
| version | version[i].result.value | org.opencontainers.image.version |
18+
19+
20+
---
21+
22+
23+
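*(1)* Authors can be declared either with the OCI `org.opencontainers.image.authors` label or with a `LABEL maintainer` instruction; both are mapped to `authors`.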
24+
- Example:
25+
```
26+
LABEL maintainer="The Prometheus Authors <prometheus-developers@googlegroups.com>"
27+
LABEL org.opencontainers.image.authors="The Prometheus Authors"
28+
```
29+
30+

docs/supported_metadata_files.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ SOMEF can extract metadata from a wide range of files commonly found in software
2424
| `*.gemspec` | Ruby | Manifest file serves as the package descriptor used in Ruby gem projects. | <div align="center">[🔍](./gemspec.md)</div>| [📄](https://guides.rubygems.org/specification-reference/)| |[Example](https://github.com/rubygems/rubygems/blob/master/bundler/bundler.gemspec) |
2525
| `cargo.toml` | Rust | Manifest file serves as the package descriptor used in Rust projects | <div align="center">[🔍](./cargo.md)</div> | [📄](https://doc.rust-lang.org/cargo/reference/manifest.html)| |[Example](https://github.com/rust-lang/cargo/blob/master/Cargo.toml) |
2626
| `*.cabal` | Haskell | Manifest file serving as the package descriptor for Haskell projects.| <div align="center">[🔍](./cabal.md)</div> | [📄](https://cabal.readthedocs.io/en/3.10/cabal-package.html)| |[Example](https://github.com/haskell/cabal/blob/master/Cabal/Cabal.cabal) |
27+
| `dockerfile` | Dockerfile | Build specification file for container images that can include software metadata via LABEL instructions (OCI specification).| <div align="center">[🔍](./dockerfiledoc.md)</div> | [📄](https://docs.docker.com/reference/dockerfile/)| |[Example](https://github.com/FairwindsOps/nova/blob/master/Dockerfile) |
28+
2729

2830
> **Note:** The general principles behind metadata mapping in SOMEF are based on the [CodeMeta crosswalk](https://github.com/codemeta/codemeta/blob/master/crosswalk.csv) and the [CodeMeta JSON-LD context](https://github.com/codemeta/codemeta/blob/master/codemeta.jsonld).
2931
> However, each supported file type may have specific characteristics and field interpretations.

src/somef/parser/dockerfile_parser.py

Lines changed: 212 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,46 +2,226 @@
22
import os
33
import re
44
from ..utils import constants
5+
from ..process_results import Result
6+
7+
def parse_dockerfile(file_path, metadata_result: Result, source):
8+
9+
print(f"Extracting properties from Dockerfile: {file_path}")
510

6-
def extract_dockerfile_maintainer(file_path):
7-
print(f"Extracting maintainers from Dockerfile: {file_path}")
8-
maintainers = []
9-
unique_maintainers = []
1011
try:
1112
with open(file_path, "rb") as file:
1213
raw_data = file.read()
1314

14-
try:
15-
content = raw_data.decode("utf-8")
16-
except UnicodeDecodeError:
17-
logging.warning(f"File {file_path} is not UTF-8 decodable. Skipping.")
18-
return maintainers
19-
20-
# not sure if should be better property author or a new property of maintainer
21-
oci_match = re.findall(
22-
constants.REGEXP_MAINTAINER_LABEL_OCI,
23-
content,
24-
re.IGNORECASE | re.MULTILINE
15+
content = raw_data.decode("utf-8")
16+
except (OSError, UnicodeDecodeError) as e:
17+
logging.warning(f"Could not process Dockerfile {file_path}: {e}")
18+
return None
19+
20+
# print(content)
21+
title_match = re.search(
22+
constants.REGEXP_DOCKER_TITLE,
23+
content,
24+
re.IGNORECASE
25+
)
26+
27+
if title_match:
28+
title = title_match.group(1).strip()
29+
if title:
30+
metadata_result.add_result(
31+
constants.CAT_NAME,
32+
{
33+
"value": title,
34+
"type": constants.STRING
35+
},
36+
1,
37+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
38+
source
39+
)
40+
41+
description_match = re.search(
42+
constants.REGEXP_DOCKER_DESCRIPTION,
43+
content,
44+
re.IGNORECASE
45+
)
46+
47+
if description_match:
48+
description = description_match.group(1).strip()
49+
if description:
50+
metadata_result.add_result(
51+
constants.CAT_DESCRIPTION,
52+
{
53+
"value": description,
54+
"type": constants.STRING
55+
},
56+
1,
57+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
58+
source
59+
)
60+
61+
licenses_match = re.search(constants.REGEXP_DOCKER_LICENSES, content, re.IGNORECASE)
62+
if licenses_match:
63+
license_info_spdx = detect_license_spdx(licenses_match.group(1).strip())
64+
65+
if license_info_spdx:
66+
license_data = {
67+
"value": licenses_match.group(1).strip(),
68+
"spdx_id": license_info_spdx.get('spdx_id'),
69+
"name": license_info_spdx.get('name'),
70+
"type": constants.LICENSE
71+
}
72+
else:
73+
license_data = {
74+
"value": licenses_match.group(1).strip(),
75+
"type": constants.LICENSE
76+
}
77+
metadata_result.add_result(
78+
constants.CAT_LICENSE,
79+
license_data,
80+
1,
81+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
82+
source
83+
)
84+
85+
86+
# source_match = re.search(constants.REGEXP_DOCKER_SOURCE, content, re.IGNORECASE)
87+
# if source_match:
88+
# properties[constants.PROP_SOURCE] = source_match.group(1).strip()
89+
90+
url_match = re.search(constants.REGEXP_DOCKER_URL, content, re.IGNORECASE)
91+
if url_match:
92+
metadata_result.add_result(
93+
constants.CAT_CODE_REPOSITORY,
94+
{
95+
"value": url_match.group(1).strip(),
96+
"type": constants.URL
97+
},
98+
1,
99+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
100+
source
101+
)
102+
103+
version_match = re.search(constants.REGEXP_DOCKER_VERSION, content, re.IGNORECASE)
104+
if version_match:
105+
metadata_result.add_result(
106+
constants.CAT_VERSION,
107+
{
108+
"value": version_match.group(1).strip(),
109+
"type": constants.RELEASE,
110+
"tag": version_match.group(1).strip()
111+
},
112+
1,
113+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
114+
source
25115
)
26-
# LABEL maintainer free
27-
label_match = re.findall(
28-
constants.REGEXP_MAINTAINER_LABEL_FREE,
29-
content,
30-
re.IGNORECASE | re.MULTILINE
116+
117+
documentation_match = re.search(constants.REGEXP_DOCKER_DOCUMENTATION, content, re.IGNORECASE)
118+
if documentation_match:
119+
metadata_result.add_result(
120+
constants.CAT_DOCUMENTATION,
121+
{
122+
"value": documentation_match.group(1).strip(),
123+
"type": constants.STRING
124+
},
125+
1,
126+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
127+
source
31128
)
32-
# Deprecated maintainer
33-
maintainer_match = re.findall(
34-
constants.REGEXP_MAINTAINER,
35-
content,
36-
re.IGNORECASE | re.MULTILINE
129+
130+
131+
vendor_match = re.search(
132+
constants.REGEXP_DOCKER_VENDOR,
133+
content,
134+
re.IGNORECASE
135+
)
136+
137+
if vendor_match:
138+
vendor = vendor_match.group(1).strip()
139+
if vendor:
140+
if vendor and re.search(constants.REGEXP_LTD_INC, vendor, re.IGNORECASE):
141+
type_vendor = "Organization"
142+
else:
143+
type_vendor = "Person"
144+
145+
metadata_result.add_result(
146+
constants.CAT_OWNER,
147+
{
148+
"value": vendor,
149+
"type": type_vendor
150+
},
151+
1,
152+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
153+
source
154+
)
155+
156+
# Extract maintainers
157+
maintainers = []
158+
unique_maintainers = []
159+
160+
maintainer_oci_match = re.findall(
161+
constants.REGEXP_MAINTAINER_LABEL_OCI,
162+
content,
163+
re.IGNORECASE | re.MULTILINE
164+
)
165+
# LABEL maintainer free
166+
maintanainer_label_match = re.findall(
167+
constants.REGEXP_MAINTAINER_LABEL_FREE,
168+
content,
169+
re.IGNORECASE | re.MULTILINE
170+
)
171+
# Deprecated maintainer
172+
maintainer_match = re.findall(
173+
constants.REGEXP_MAINTAINER,
174+
content,
175+
re.IGNORECASE | re.MULTILINE
176+
)
177+
maintainers.extend(maintainer_oci_match)
178+
maintainers.extend(maintanainer_label_match)
179+
maintainers.extend(maintainer_match)
180+
181+
unique_maintainers = list({m.strip() for m in maintainers if m.strip()})
182+
183+
for maintainer in unique_maintainers:
184+
metadata_result.add_result(
185+
constants.CAT_AUTHORS,
186+
{
187+
"type": constants.AGENT,
188+
"value": maintainer
189+
},
190+
1,
191+
constants.TECHNIQUE_CODE_CONFIG_PARSER,
192+
source
37193
)
38194

39-
maintainers.extend(oci_match)
40-
maintainers.extend(label_match)
41-
maintainers.extend(maintainer_match)
195+
return metadata_result
196+
197+
def detect_license_spdx(license_text):
198+
"""
199+
Function that given a license text, infers the name and spdx id in a dockerfile
200+
Parameters
201+
----------
202+
license_text
203+
204+
Returns
205+
-------
206+
A JSON dictionary with name and spdx id
207+
"""
208+
print("Detecting license from text:", license_text)
209+
for license_name, license_info in constants.LICENSES_DICT.items():
210+
if re.search(license_info["regex"], license_text, re.IGNORECASE):
211+
return {
212+
"name": license_name,
213+
"spdx_id": f"{license_info['spdx_id']}",
214+
"@id": f"https://spdx.org/licenses/{license_info['spdx_id']}"
215+
}
42216

43-
unique_maintainers = list({m.strip() for m in maintainers if m.strip()})
44-
except OSError:
45-
logging.warning(f"Could not read Dockerfile {file_path}")
217+
for license_name, license_info in constants.LICENSES_DICT.items():
218+
spdx_id = license_info["spdx_id"]
219+
if re.search(rf'\b{re.escape(spdx_id)}\b', license_text, re.IGNORECASE):
220+
return {
221+
"name": license_name,
222+
"spdx_id": spdx_id,
223+
"@id": f"https://spdx.org/licenses/{spdx_id}"
224+
}
225+
return None
46226

47-
return unique_maintainers
227+

src/somef/process_files.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from .parser.description_parser import parse_description_file
2222
from .parser.toml_parser import parse_toml_file
2323
from .parser.cabal_parser import parse_cabal_file
24-
from .parser.dockerfile_parser import extract_dockerfile_maintainer
24+
from .parser.dockerfile_parser import parse_dockerfile
2525
from chardet import detect
2626

2727

@@ -76,20 +76,22 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
7676
if filename == "Dockerfile" or filename.lower() == "docker-compose.yml":
7777
docker_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
7878
repo_relative_path, filename)
79-
if filename == "Dockerfile":
80-
format_file = constants.FORMAT_DOCKERFILE
81-
maintainers = extract_dockerfile_maintainer(os.path.join(repo_dir, file_path))
82-
else:
83-
format_file = constants.FORMAT_DOCKER_COMPOSE
84-
maintainers = None
79+
80+
# full_path = os.path.join(repo_dir, file_path)
8581

8682
result_value = {
8783
constants.PROP_VALUE: docker_url,
8884
constants.PROP_TYPE: constants.URL,
89-
constants.PROP_FORMAT: format_file
9085
}
91-
if maintainers:
92-
result_value[constants.PROP_AUTHOR] = maintainers
86+
87+
if filename == "Dockerfile":
88+
format_file = constants.FORMAT_DOCKERFILE
89+
result_value[constants.PROP_FORMAT] = format_file
90+
metadata_result = parse_dockerfile(os.path.join(dir_path, filename), metadata_result, docker_url)
91+
else:
92+
format_file = constants.FORMAT_DOCKER_COMPOSE
93+
94+
result_value[constants.PROP_FORMAT] = format_file
9395

9496
metadata_result.add_result(
9597
constants.CAT_HAS_BUILD_FILE,
@@ -98,14 +100,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
98100
constants.TECHNIQUE_FILE_EXPLORATION,
99101
docker_url
100102
)
101-
# metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
102-
# {
103-
# constants.PROP_VALUE: docker_url,
104-
# constants.PROP_TYPE: constants.URL,
105-
# constants.PROP_FORMAT: format_file
106-
# },
107-
# 1,
108-
# constants.TECHNIQUE_FILE_EXPLORATION, docker_url)
103+
109104
if filename.lower().endswith(".ipynb"):
110105
notebook_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
111106
repo_relative_path, filename)

0 commit comments

Comments
 (0)