KnowledgeCaptureAndDiscovery · dgarijo · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/README.md b/README.md
@@ -35,7 +35,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
 - **Contact**: Contact person responsible for maintaining a software component
 - **Continuous integration**: Link to continuous integration service(s)
 - **Contribution guidelines**: Text indicating how to contribute to this code repository
-- **Contributors**: Contributors to a software component
+- **Contributors**: Contributors to a software component. Note: Because a repository may have a very large number of contributors, this information is extracted from metadata files in the repository (e.g., codemeta.json, CONTRIBUTORS) rather than from the commit history.
 - **Creation date**: Date when the repository was created
 - **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
 - **Date updated**: Date of last release.

diff --git a/docs/codemetajson.md b/docs/codemetajson.md
@@ -15,6 +15,12 @@ These fields are defined in the [Codemeta specification](https://github.com/code
 | citation - doi                   |   citation[i].result.doi       |     referencePublication.identifier   | 
 | code_repository           |   code_repository[i].result.value   |     codeRepository           |
 | continuous_integration    |   continuous_integration[i].result.value |     contIntegration          |
+| contributors - value   |   contributors[i].result.value |     contributor.givenName + contributor.familyName or just name if organization       |
+| contributors - name  |   contributors[i].result.name |     contributor.givenName + contributor.familyName or just name if organization       |
+| contributors - last_name  |   contributors[i].result.last_name |     contributor.familyName         |
+| contributors - given_name  |   contributors[i].result.given_name |     contributor.givenName         |
+| contributors - identifier  |   contributors[i].result.identifier |     contributor.@id          |
+| contributors - email   |   contributors[i].result.email |     contributor.email         |
 | date_created              |   date_created[i].result.value  |     dateCreated              |
 | date_updated              |   date_updated[i].result.value   |     dateModified             |
 | date_published            |   date_published[i].result .value   |     datePublished  |

diff --git a/docs/index.md b/docs/index.md
@@ -42,7 +42,7 @@ Given a readme file (or a GitHub repository) SOMEF will extract the following ca
 - **Contact**: Contact person responsible for maintaining a software component
 - **Continuous integration**: Link to continuous integration service(s)
 - **Contribution guidelines**: Text indicating how to contribute to this code repository
-- **Contributors**: Contributors to a software component
+- **Contributors**: Contributors to a software component. Note: Contributor metadata is extracted from metadata files (e.g., codemeta.json, CONTRIBUTORS), not from git logs.
 - **Copyright holder**: Entity or individual owning the rights to the software. The year is also extracted, if available.
 - **Creation date**: Date when the repository was created
 - **Date updated**: Date of last release.

diff --git a/docs/output.md b/docs/output.md
@@ -74,7 +74,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
 - `contact`: Contact person responsible for maintaining a software component.
 - `continuous_integration`: Link to continuous integration service, supported on GitHub as well as in GitLab.
 - `contributing guidelines`: Guidelines indicating how to contribute to a software component.
-- `contributors`: Contributors to a software component
+- `contributors`: Contributors to a software component. Note: Because a repository may have a very large number of contributors, this information is extracted from metadata files in the repository (e.g., codemeta.json, CONTRIBUTORS) rather than from the commit history.
 - `copyright_holder`: Entity or individual owning the rights to the software. The year is also extracted, if available.
 - `date_created`: Date when the software component was created.
 - `date_updated`: Date when the software component was last updated (note that this will always be older than the date of the extraction).

diff --git a/poetry.lock b/poetry.lock
diff --git a/src/somef/export/json_export.py b/src/somef/export/json_export.py
@@ -581,8 +581,10 @@ def format_date(date_string):
         if runtimes:
             codemeta_output[constants.CAT_CODEMETA_RUNTIMEPLATFORM] = ", ".join(runtimes)
 
-    # if "contributors" in repo_data:
-    #     codemeta_output["contributor"] = data_path(["contributors", "excerpt"])
+    if constants.CAT_CONTRIBUTORS in repo_data:
+        raw_contributors = repo_data[constants.CAT_CONTRIBUTORS]
+        codemeta_output[constants.CAT_CODEMETA_CONTRIBUTOR] = parse_contributors(raw_contributors)
+
     # A person is expected, and we extract text at the moment
     if descriptions_text:
         codemeta_output[constants.CAT_CODEMETA_DESCRIPTION] = descriptions_text
@@ -684,6 +686,74 @@ def map_requirement_type(t):
     # default
     return constants.SCHEMA_SOFTWARE_APPLICATION
 
+def parse_contributors(raw):
+    """
+    Convert SOMEF contributor results into CodeMeta contributor entries.
+
+    Parameters
+    ----------
+    raw: list
+        Entries of the contributors category; each is a dict with a "result" dict.
+
+    Returns
+    -------
+    list
+        Dicts with "@type" and "name" (plus optional givenName, familyName,
+        email and @id), de-duplicated by name in order of first appearance.
+    """
+    contributors = []
+    seen = set()
+
+    def agent_type(label):
+        # Names matching REGEXP_LTD_INC are exported as organizations
+        if re.search(constants.REGEXP_LTD_INC, label, re.IGNORECASE):
+            return "Organization"
+        return "Person"
+
+    # SOMEF result key -> CodeMeta property
+    optional_fields = (
+        ("given_name", "givenName"),
+        ("last_name", "familyName"),
+        ("email", "email"),
+        ("identifier", "@id"),
+    )
+
+    for entry in raw or []:
+        result = entry.get("result", {})
+        rtype = result.get("type")
+        name = result.get("value")
+
+        # Only non-empty text can be matched against the regexp or split in lines
+        if not isinstance(name, str) or not name:
+            continue
+
+        if rtype == "Agent":
+            if name in seen:
+                continue
+            contributor = {"@type": agent_type(name), "name": name}
+            for source_key, target_key in optional_fields:
+                if source_key in result:
+                    contributor[target_key] = result[source_key]
+            contributors.append(contributor)
+            seen.add(name)
+
+        elif rtype == "File_dump":
+            for line in name.splitlines():
+                line = line.strip()
+                # skip blank lines, markdown headers, table rows and links
+                if not line or line.startswith(("#", "|")) or "[" in line:
+                    continue
+
+                # avoid sentences
+                if len(line.split()) > 4:
+                    continue
+
+                if line in seen:
+                    continue
+                contributors.append({"@type": agent_type(line), "name": line})
+                seen.add(line)
+
+    return contributors
 
 
 """

diff --git a/src/somef/parser/codemeta_parser.py b/src/somef/parser/codemeta_parser.py
@@ -234,6 +234,82 @@ def parse_programming_language(language_data):
 
     return None
 
+def parse_contributors(contributors_data):
+    """
+    Parse contributors from codemeta.json
+
+    Parameters
+    ----------
+    contributors_data: list, dict, str
+        Contributor data from codemeta.json: one person/organization object,
+        a plain name, or a list mixing both.
+
+    Returns
+    -------
+    list
+        List of contributor dictionaries. Each one has "value", "name" and
+        "type", plus "given_name", "last_name", "email", "affiliation" and
+        "identifier" when the source entry provides them.
+    """
+    contributors_list = []
+
+    # A single contributor may be given without the enclosing list
+    if isinstance(contributors_data, (dict, str)):
+        contributors_data = [contributors_data]
+
+    if not isinstance(contributors_data, list):
+        return contributors_list
+
+    for contributor in contributors_data:
+        if isinstance(contributor, dict):
+            given = contributor.get("givenName")
+            family = contributor.get("familyName")
+            # Use "given family" when both parts exist; otherwise the explicit
+            # name, then whichever part is present, so partial entries are kept
+            if given and family:
+                full_name = f"{given} {family}"
+            else:
+                full_name = contributor.get("name") or given or family
+            if not full_name:
+                continue
+
+            contributor_info = {
+                "value": full_name,
+                "name": full_name,
+                "type": constants.AGENT
+            }
+            if given:
+                contributor_info["given_name"] = given
+            if family:
+                contributor_info["last_name"] = family
+            if "email" in contributor:
+                contributor_info["email"] = contributor["email"]
+
+            # affiliation can be an organization object or a plain string
+            affil = contributor.get("affiliation")
+            if isinstance(affil, dict) and affil.get("name"):
+                contributor_info["affiliation"] = affil["name"]
+            elif isinstance(affil, str) and affil:
+                contributor_info["affiliation"] = affil
+
+            identifier = contributor.get("identifier") or contributor.get("@id")
+            if identifier:
+                contributor_info["identifier"] = identifier
+            contributors_list.append(contributor_info)
+
+        elif isinstance(contributor, str):
+            name = contributor.strip()
+            if name:
+                # Same keys as the dict branch so consumers can read "name"
+                contributors_list.append({
+                    "value": name,
+                    "name": name,
+                    "type": constants.AGENT
+                })
+
+    return contributors_list
+
+
 def parse_codemeta_json_file(file_path, metadata_result: Result, source):
     """
 
@@ -290,6 +366,17 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source):
                     source
                 )
 
+            if "contributor" in data:
+                contributors = parse_contributors(data["contributor"])
+                for contributor in contributors:
+                    metadata_result.add_result(
+                        constants.CAT_CONTRIBUTORS,
+                        contributor,
+                        1,
+                        constants.TECHNIQUE_CODE_CONFIG_PARSER,
+                        source
+                    )
+
             if "issueTracker" in data:
                 metadata_result.add_result(
                     constants.CAT_ISSUE_TRACKER,
@@ -570,7 +657,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source):
                             if author_name:
                                 author_info = {
                                     "value": author_name,
-                                    "type": constants.STRING
+                                    "type": constants.AGENT
                                 }
 
                                 if "email" in author:
@@ -604,7 +691,7 @@ def parse_codemeta_json_file(file_path, metadata_result: Result, source):
                     if author_name:
                         author_info = {
                             "value": author_name,
-                            "type": constants.STRING
+                            "type": constants.AGENT
                         }
 
                         if "email" in author:

diff --git a/src/somef/test/test_codemeta_export.py b/src/somef/test/test_codemeta_export.py
@@ -618,6 +618,55 @@ def test_issue_886_apache_code(self):
 
         os.remove(test_data_path + "test_issue_886_apache_code.json")
 
+
+
+    def test_issue_936_contributors(self):
+        """Checks whether contributors are correctly extracted from the repository"""
+        output_path = test_data_path + "test_issue_936_contributors.json"
+        somef_cli.run_cli(threshold=0.8,
+                            ignore_classifiers=False,
+                            repo_url=None,
+                            local_repo=test_data_repositories + "codemeta_repo",
+                            doc_src=None,
+                            in_file=None,
+                            output=None,
+                            graph_out=None,
+                            graph_format="turtle",
+                            codemeta_out=output_path,
+                            pretty=True,
+                            missing=False,
+                            readme_only=False)
+
+        # The context manager closes the file even if reading or parsing fails
+        with open(output_path, "r") as text_file:
+            json_content = json.load(text_file)
+
+        contributors = json_content[constants.CAT_CODEMETA_CONTRIBUTOR]
+        # Each expected contributor is identified by one distinguishing property
+        self.assertTrue(any(
+            c["name"] == "Abby Cabunoc Mayes" and
+            c.get("givenName") == "Abby Cabunoc"
+            for c in contributors
+        ),
+        "Expected contributor Abby Cabunoc Mayes with givenName='Abby Cabunoc' not found")
+
+        self.assertTrue(any(
+            c["name"] == "Arfon Smith" and
+            c.get("@id") == "http://orcid.org/0000-0002-3957-2474"
+            for c in contributors
+        ),
+        "Expected contributor Arfon Smith with @id='http://orcid.org/0000-0002-3957-2474' not found")
+
+        self.assertTrue(any(
+            c["name"] == "Dan Katz" and
+            c.get("email") == "dskatz@illinois.edu"
+            for c in contributors
+        ),
+        "Expected contributor Dan Katz with email='dskatz@illinois.edu' not found")
+
+        os.remove(output_path)
+
+
     @classmethod
     def tearDownClass(cls):
         """delete temp file JSON just if all the test pass"""

diff --git a/src/somef/test/test_codemeta_parser.py b/src/somef/test/test_codemeta_parser.py
@@ -17,6 +17,8 @@ def load_expected(self, repo_name):
         """Load expected YAML for a given repo."""
         yaml_path = EXPECT_DIR / f"{repo_name}.yaml"
         if not yaml_path.exists():
+            if repo_name == "codemeta_repo":
+                return {}
             self.skipTest(f"No expected YAML for repository '{repo_name}'")
         with open(yaml_path, "r", encoding="utf-8") as f:
             return yaml.safe_load(f)
@@ -62,5 +64,34 @@ def test_parse_multiple_codemeta_files(self):
                             f"[{repo_folder}] Mismatch in {cat_name}"
                         )
 
+
+    def test_parse_contributors(self):
+        """Checks that codemeta.json contributors keep their optional properties"""
+        codemeta_path = REPOS_DIR / "codemeta_repo" / "codemeta.json"
+        result = Result()
+
+        metadata_result = parse_codemeta_json_file(codemeta_path, result, "https://example.org/codemeta.json")
+
+        self.assertIn(constants.CAT_CONTRIBUTORS, metadata_result.results)
+        contributors = result.results[constants.CAT_CONTRIBUTORS]
+
+        # (name, property, expected value) for each contributor being checked
+        expected = [
+            ("Abby Cabunoc Mayes", "given_name", "Abby Cabunoc"),
+            ("Arfon Smith", "identifier", "http://orcid.org/0000-0002-3957-2474"),
+            ("Dan Katz", "email", "dskatz@illinois.edu"),
+        ]
+
+        for name, key, value in expected:
+            with self.subTest(contributor=name):
+                # .get() keeps entries lacking "name" from raising KeyError
+                found = any(
+                    c["result"].get("name") == name and c["result"].get(key) == value
+                    for c in contributors
+                )
+                self.assertTrue(
+                    found, f"Expected contributor {name} with {key}='{value}' not found")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/src/somef/test/test_data/expected/aladin-lite.yaml b/src/somef/test/test_data/expected/aladin-lite.yaml
@@ -15,7 +15,7 @@ CAT_IDENTIFIER: 10.5281/zenodo.7638833 # Passed
 CAT_DESCRIPTION: An astronomical HiPS visualizer in the browser. # Passed
 CAT_AUTHORS: # Passed
   value: Matthieu Baumann
-  type: String
+  type: Agent
   email: matthieu.baumann@unistra.fr
   affiliation: "Universit\u00e9 de Strasbourg, CNRS, Observatoire astronomique de Strasbourg, UMR 7550, F-67000 Strasbourg, France"
   identifier: "https://orcid.org/0000-0002-7123-773X"

diff --git a/src/somef/test/test_data/expected/gammapy.yaml b/src/somef/test/test_data/expected/gammapy.yaml
@@ -28,4 +28,4 @@ CAT_REQUIREMENTS: # Passed
 
 CAT_AUTHORS: # Passed
     value: Fabio Acero
-    type: String    
+    type: Agent   
diff --git a/src/somef/test/test_data/expected/r3broot2.yaml b/src/somef/test/test_data/expected/r3broot2.yaml
@@ -36,7 +36,7 @@ CAT_REQUIREMENTS: # Passed
   version: Null
 CAT_AUTHORS: # Passed
   value: "Jose Luis Rodr\u00edguez-S\u00e1nchez"
-  type: String
+  type: Agent
   email: j.l.rodriguez.sanchez@udc.es
   affiliation: "CITENI, Industrial Campus of Ferrol, University of Coruña, 15403 Ferrol, Spain" # Passed
   identifier: https://orcid.org/0000-0002-4702-5294