Merge pull request #884 from juanjemdIos/master

dgarijo · web-flow · commit 6183e73b2ee5 · 2026-01-12T11:20:49.000+01:00
author or maintainer of dockerfile. Fixes #725
diff --git a/src/somef/parser/dockerfile_parser.py b/src/somef/parser/dockerfile_parser.py
@@ -0,0 +1,47 @@
+import logging
+import os
+import re
+from ..utils import constants
+
+def extract_dockerfile_maintainer(file_path):
+    """Return the unique maintainer/author strings declared in a Dockerfile.
+
+    Searches for the OCI ``org.opencontainers.image.authors`` label, a
+    ``LABEL maintainer=...`` entry and the deprecated ``MAINTAINER``
+    instruction, using the patterns defined in ``constants``.
+
+    :param file_path: path of the Dockerfile to read.
+    :return: stripped, de-duplicated values in first-seen order; an empty
+        list when the file cannot be read or is not valid UTF-8.
+    """
+    logging.debug(f"Extracting maintainers from Dockerfile: {file_path}")
+    try:
+        with open(file_path, "rb") as file:
+            raw_data = file.read()
+    except OSError:
+        logging.warning(f"Could not read Dockerfile {file_path}")
+        return []
+
+    try:
+        content = raw_data.decode("utf-8")
+    except UnicodeDecodeError:
+        logging.warning(f"File {file_path} is not UTF-8 decodable. Skipping.")
+        return []
+
+    # NOTE: it is still undecided whether these values belong under the
+    # author property or under a dedicated maintainer property.
+    flags = re.IGNORECASE | re.MULTILINE
+    maintainers = []
+    # Patterns are tried in a fixed order: OCI authors label, free-form
+    # maintainer label, then the deprecated MAINTAINER instruction.
+    for pattern in (
+        constants.REGEXP_MAINTAINER_LABEL_OCI,
+        constants.REGEXP_MAINTAINER_LABEL_FREE,
+        constants.REGEXP_MAINTAINER,
+    ):
+        maintainers.extend(re.findall(pattern, content, flags))
+
+    # dict.fromkeys drops duplicates but keeps first-seen order, so the
+    # output is stable between runs (a set has no guaranteed order).
+    stripped = (m.strip() for m in maintainers)
+    return list(dict.fromkeys(m for m in stripped if m))
diff --git a/src/somef/process_files.py b/src/somef/process_files.py
@@ -21,6 +21,7 @@
 from .parser.description_parser import parse_description_file
 from .parser.toml_parser import parse_toml_file
 from .parser.cabal_parser import parse_cabal_file
+from .parser.dockerfile_parser import extract_dockerfile_maintainer
 from chardet import detect
 
 
@@ -77,16 +78,34 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                                                repo_relative_path, filename)
                     if filename == "Dockerfile":
                         format_file = constants.FORMAT_DOCKERFILE
+                        maintainers = extract_dockerfile_maintainer(os.path.join(repo_dir, file_path))
                     else:
                         format_file = constants.FORMAT_DOCKER_COMPOSE
-                    metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
-                                               {
-                                                   constants.PROP_VALUE: docker_url,
-                                                   constants.PROP_TYPE: constants.URL,
-                                                   constants.PROP_FORMAT: format_file
-                                               },
-                                               1,
-                                               constants.TECHNIQUE_FILE_EXPLORATION, docker_url)
+                        maintainers = None
+
+                    result_value = {
+                        constants.PROP_VALUE: docker_url,
+                        constants.PROP_TYPE: constants.URL,
+                        constants.PROP_FORMAT: format_file
+                    }
+                    if maintainers:
+                        result_value[constants.PROP_AUTHOR] = maintainers
+
+                    metadata_result.add_result(
+                        constants.CAT_HAS_BUILD_FILE,
+                        result_value,
+                        1,
+                        constants.TECHNIQUE_FILE_EXPLORATION,
+                        docker_url
+                    )
+                    # metadata_result.add_result(constants.CAT_HAS_BUILD_FILE,
+                    #                            {
+                    #                                constants.PROP_VALUE: docker_url,
+                    #                                constants.PROP_TYPE: constants.URL,
+                    #                                constants.PROP_FORMAT: format_file
+                    #                            },
+                    #                            1,
+                    #                            constants.TECHNIQUE_FILE_EXPLORATION, docker_url)
                 if filename.lower().endswith(".ipynb"):
                     notebook_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir,
                                                  repo_relative_path, filename)
@@ -652,50 +671,3 @@ def clean_text(text):
             cleaned_lines.append(line)
     return "\n".join(cleaned_lines)
 
-#     """
-#     Proccess a text with possible authors
-#     """
-#     if not author_str:
-#         return []
-
-#     authors = []
-
-#     for line in author_str.splitlines():
-#         line = line.strip()
-#         if not line or line.startswith("#"):
-#             continue  
-
-#         email_match = re.search(r'<([^>]+)>', line)
-#         if email_match:
-#             email = email_match.group(1)
-#             name = line[:email_match.start()].strip()
-#         else:
-#             name = line
-#             email = None
-
-#         if name:
-#             if re.search(constants.REGEXP_LTD_INC, name, re.IGNORECASE):
-#                 type_author = "Organization"
-#                 author_info = {
-#                     "name": name,
-#                     "email": email,
-#                     "value": name,
-#                     "type": type_author
-#                 }
-#             else:
-#                 type_author = "Person"
-#                 name_parts = name.split()
-#                 given_name = name_parts[0] if name_parts else None
-#                 last_name = " ".join(name_parts[1:]) if len(name_parts) > 1 else None
-#                 author_info = {
-#                     "name": name,
-#                     "email": email,
-#                     "value": name,
-#                     "type": type_author,
-#                     "given_name": given_name,
-#                     "last_name": last_name
-#                 }
-
-#             authors.append(author_info)
-
-#     return authors
diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py
@@ -470,5 +470,85 @@ def test_issue_859(self):
         os.remove(test_data_path + "test-859.json")
 
 
+    def test_issue_725(self):
+        """Check that the OCI authors label of a Dockerfile is exported."""
+
+        output_path = test_data_path + "test_issue_725.json"
+        somef_cli.run_cli(threshold=0.8,
+                          ignore_classifiers=False,
+                          repo_url=None,
+                          local_repo=test_data_repositories + "Fairwinds",
+                          doc_src=None,
+                          in_file=None,
+                          output=output_path,
+                          graph_out=None,
+                          graph_format="turtle",
+                          codemeta_out=None,
+                          pretty=True,
+                          missing=False,
+                          readme_only=False)
+
+        try:
+            # The context manager closes the file even if parsing fails.
+            with open(output_path, "r") as text_file:
+                json_content = json.load(text_file)
+        finally:
+            # Remove the generated file so a failed check leaves nothing behind.
+            if os.path.exists(output_path):
+                os.remove(output_path)
+
+        # Gather every author attached to a has_build_file result.
+        authors = []
+        for entry in json_content.get("has_build_file", []):
+            authors.extend(entry.get("result", {}).get("author", []))
+
+        expected_author = "FairwindsOps, Inc."
+        assert expected_author in authors, (
+            f"Expected author '{expected_author}' not found. "
+            f"Authors found: {authors}"
+        )
+
+    def test_issue_725_2(self):
+        """Check that both maintainer declarations of a Dockerfile are exported."""
+
+        output_path = test_data_path + "test_issue_725_2.json"
+        somef_cli.run_cli(threshold=0.8,
+                          ignore_classifiers=False,
+                          repo_url=None,
+                          local_repo=test_data_repositories + "Prometeus",
+                          doc_src=None,
+                          in_file=None,
+                          output=output_path,
+                          graph_out=None,
+                          graph_format="turtle",
+                          codemeta_out=None,
+                          pretty=True,
+                          missing=False,
+                          readme_only=False)
+
+        try:
+            # The context manager closes the file even if parsing fails.
+            with open(output_path, "r") as text_file:
+                json_content = json.load(text_file)
+        finally:
+            # Remove the generated file so a failed check leaves nothing behind.
+            if os.path.exists(output_path):
+                os.remove(output_path)
+
+        # Gather every author attached to a has_build_file result.
+        authors = []
+        for entry in json_content.get("has_build_file", []):
+            authors.extend(entry.get("result", {}).get("author", []))
+
+        expected_author = "The Prometheus Authors"
+        assert expected_author in authors, (
+            f"Expected author '{expected_author}' not found. "
+            f"Authors found: {authors}"
+        )
+        expected_count = 2
+        assert len(authors) == expected_count, (
+            f"Expected {expected_count} authors, but found {len(authors)}: {authors}"
+        )
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/src/somef/test/test_data/repositories/Fairwinds/Dockerfile b/src/somef/test/test_data/repositories/Fairwinds/Dockerfile
@@ -0,0 +1,14 @@
+FROM alpine:3.23
+
+LABEL org.opencontainers.image.authors="FairwindsOps, Inc." \
+      org.opencontainers.image.vendor="FairwindsOps, Inc." \
+      org.opencontainers.image.title="Nova" \
+      org.opencontainers.image.description="Nova is a cli tool to find outdated or deprecated Helm charts running in your Kubernetes cluster." \
+      org.opencontainers.image.documentation="https://nova.docs.fairwinds.com/" \
+      org.opencontainers.image.source="https://github.com/FairwindsOps/nova" \
+      org.opencontainers.image.url="https://github.com/FairwindsOps/nova" \
+      org.opencontainers.image.licenses="Apache License 2.0"
+
+USER nobody
+COPY nova /
+CMD ["/nova"]
diff --git a/src/somef/test/test_data/repositories/Prometeus/Dockerfile b/src/somef/test/test_data/repositories/Prometeus/Dockerfile
@@ -0,0 +1,31 @@
+ARG ARCH="amd64"
+ARG OS="linux"
+FROM quay.io/prometheus/busybox-${OS}-${ARCH}:latest
+LABEL maintainer="The Prometheus Authors <prometheus-developers@googlegroups.com>"
+LABEL org.opencontainers.image.authors="The Prometheus Authors" \
+      org.opencontainers.image.vendor="Prometheus" \
+      org.opencontainers.image.title="Prometheus" \
+      org.opencontainers.image.description="The Prometheus monitoring system and time series database" \
+      org.opencontainers.image.source="https://github.com/prometheus/prometheus" \
+      org.opencontainers.image.url="https://github.com/prometheus/prometheus" \
+      org.opencontainers.image.documentation="https://prometheus.io/docs" \
+      org.opencontainers.image.licenses="Apache License 2.0"
+
+ARG ARCH="amd64"
+ARG OS="linux"
+COPY .build/${OS}-${ARCH}/prometheus        /bin/prometheus
+COPY .build/${OS}-${ARCH}/promtool          /bin/promtool
+COPY documentation/examples/prometheus.yml  /etc/prometheus/prometheus.yml
+COPY LICENSE                                /LICENSE
+COPY NOTICE                                 /NOTICE
+COPY npm_licenses.tar.bz2                   /npm_licenses.tar.bz2
+
+WORKDIR /prometheus
+RUN chown -R nobody:nobody /etc/prometheus /prometheus && chmod g+w /prometheus
+
+USER       nobody
+EXPOSE     9090
+VOLUME     [ "/prometheus" ]
+ENTRYPOINT [ "/bin/prometheus" ]
+CMD        [ "--config.file=/etc/prometheus/prometheus.yml", \
+             "--storage.tsdb.path=/prometheus" ]
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
@@ -454,3 +454,10 @@ class RepositoryType(Enum):
 CAT_CODEMETA_SOFTWAREREQUIREMENTS = "softwareRequirements"
 CAT_CODEMETA_SOFTWAREVERSION = "softwareVersion"
 CAT_CODEMETA_URL = "url"
+
+
+# Dockerfile maintainer patterns; the Dockerfile parser applies them with re.IGNORECASE | re.MULTILINE.
+# Each one captures a single value: the OCI image.authors label, a "maintainer" label, or the MAINTAINER instruction.
+REGEXP_MAINTAINER_LABEL_OCI = r'^\s*LABEL\s+org\.opencontainers\.image\.authors\s*=\s*["\']([^"\'\\]+)["\']?\s*(?:\\)?\s*$'
+REGEXP_MAINTAINER_LABEL_FREE = r'^\s*LABEL\s+"?maintainer"?\s*=\s*["\']?(.+?)["\']?\s*$'
+REGEXP_MAINTAINER = r'^\s*MAINTAINER\s+(.+)$'