Skip to content

Commit c9c0f4d

Browse files
committed
Full commit of files from neotoma_doi
1 parent 210d708 commit c9c0f4d

44 files changed

Lines changed: 2193 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,7 @@
22
.env
33

44
.coverage
5+
6+
src/neotomadoi/__pycache__/
7+
8+
*.log

example.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import neotomadoi
2+
from dotenv import load_dotenv
3+
import os
4+
import json
5+
import psycopg2
6+
import psycopg2.extras
7+
8+
load_dotenv()

# DataCite credentials are supplied as a JSON document in the DCITE
# environment variable (typically via the .env file loaded above).
dcite_raw = os.getenv("DCITE")
if dcite_raw is None:
    raise RuntimeError(
        "The DCITE environment variable is not set; it must hold the "
        "DataCite credentials as a JSON document."
    )
DCITE = json.loads(dcite_raw)

datacite_meta = neotomadoi.credentials(DCITE)

con = neotomadoi.neo_connect(test=False)

# Every dataset whose datasettypeid is not 1 (geochronologic datasets are
# excluded).
# NOTE(review): an earlier comment said this selects datasets "between two
# months and two days old", but the query applies no age filter, and the
# LEFT JOIN to ndb.datasetdoi does not restrict the result -- confirm intent.
query = """SELECT DISTINCT ds.datasetid
           FROM ndb.datasets AS ds
           LEFT JOIN ndb.datasetdoi AS dsdoi ON dsdoi.datasetid = ds.datasetid
           WHERE NOT ds.datasettypeid = 1;"""

with con.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
    cur.execute(query)
    datasetids = [row[0] for row in cur.fetchall()]


def _append_json_line(path, record):
    """Append `record` to the file at `path` as one JSON document plus a newline."""
    with open(path, "a", encoding="UTF-8") as f:
        json.dump(record, f)
        f.write("\n")


for i in datasetids:
    print(f"Working on {i}")
    new_doi = neotomadoi.neotomaDOI(datasetid=i, defaults="neotomadoi.yaml")
    new_doi.set_user(datacite_meta)
    new_doi.prod_mode()
    try:
        try:
            new_doi.update()
        except ValueError as e:
            if "critical" in str(e):
                # A "critical" failure is retried once after freezing the data.
                new_doi.freeze_data(con)
                new_doi.update()
            else:
                # Same control flow as before (carry on), but no longer silent.
                print(f" Continuing after non-critical error: {e}")
        new_doi.validate()
        new_doi.get_activity()
        # Activity recorded *before* minting tells us whether this is a brand
        # new DOI (no prior activity) or an update of an existing one.
        old_activity = len(new_doi.activity)
        new_doi.mint_doi(publish=True)
        new_doi.get_meta()
        record = {"datasetid": i, "doi": new_doi.identifiers, "meta": new_doi.meta}
        if old_activity == 0:
            log_path, outcome = "minting_dois.log", "Minted new DOI"
        else:
            log_path, outcome = "updating_dois.log", "Updated DOI"
        _append_json_line(log_path, record)
        # Double-quoted f-string so the inner single quotes parse on
        # Python versions before 3.12 as well.
        print(f" {outcome}: {new_doi.identifiers.get('identifier')}")
    except Exception as e:
        # Top-level boundary for one dataset: record the failure and move on
        # so a single bad dataset does not stop the whole run.
        print("Whoops.")
        print(e)
        _append_json_line("failing_dois.log", {"datasetid": i, "error": str(e)})

con.close()

neotomadoi.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
language: EN
2+
rightsList:
3+
- rights: CC-BY4
4+
rightsUri: http://creativecommons.org/licenses/by/4.0/deed.en_US
5+
schemeUri: https://spdx.org/licenses/
6+
rightsIdentifierScheme: SPDX
7+
- rights: TK Open to Collaboration (TK CB)
8+
rightsUri: https://localcontexts.org/notice/open-to-collaborate/
9+
schemeUri: https://localcontexts.org
10+
rightsIdentifierScheme: Local Contexts
11+
schemaVersion: "http://datacite.org/schema/kernel-4"
12+
types:
13+
resourceType: Dataset/Paleoecological Sample Data
14+
resourceTypeGeneral: Dataset
15+
publisher:
16+
name: Neotoma Paleoecology Database
17+
publisherIdentifier: 10.17616/R3PD38
18+
publisherIdentifierScheme: DOI
19+
lang: EN
20+
subjects:
21+
- subject: Paleoecology
22+
subjectScheme: Library of Congress Subject Headings
23+
schemeUri: https://id.loc.gov/authorities/subjects
24+
valueUri: http://id.loc.gov/authorities/subjects/sh85097060
25+
lang: en-us
26+
formats:
27+
- application/json

src/neotomadoi/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from .neo_connect import neo_connect as neo_connect
2+
from .neo_creators import neo_creators as neo_creators
3+
from .neotomaDOI import neotomaDOI as neotomaDOI
4+
from .neotomaDOI import testMode as testMode
5+
from .neotomaDOI import credentials as credentials
6+
from .neotomaDOI import activity as activity
7+
from .neo_contributors import neo_contributors as neo_contributors
8+
from .neo_subjects import neo_subjects as neo_subjects
9+
from .neo_title import neo_title as neo_title
10+
from .neo_location import neo_location as neo_location
11+
from .neo_relatedIdentifiers import neo_relatedIdentifiers as neo_relatedIdentifiers
12+
from .neo_identifier import neo_identifier as neo_identifier
13+
from .neo_dates import neo_dates as neo_dates
14+
from .neo_size import neo_size as neo_size
15+
from .neo_description import neo_description as neo_description

src/neotomadoi/neo_connect.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import psycopg2
2+
from dotenv import dotenv_values
3+
from json import loads
4+
5+
6+
def neo_connect(test: bool = True) -> psycopg2.connect:
    """_Open a connection to the Neotoma Database_

    Settings are read with `dotenv_values()`. The selected entry
    (`DBAUTH_TEST` or `DBAUTH`) is parsed as JSON and its members are passed
    to `psycopg2.connect` as keyword arguments, with a five second
    connection timeout.

    Args:
        test (bool): _When true use the `DBAUTH_TEST` settings, otherwise `DBAUTH`._

    Returns:
        psycopg2.connect: _The connection object returned by `psycopg2.connect`._
    """
    env_values = dotenv_values()
    auth_key = "DBAUTH_TEST" if test else "DBAUTH"
    settings = loads(env_values[auth_key])
    return psycopg2.connect(**settings, connect_timeout=5)

src/neotomadoi/neo_contributors.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import psycopg2
2+
import psycopg2.extras
3+
4+
5+
def neo_contributors(con: psycopg2.connect, self) -> list:
    """_Obtain a list of the dataset contributors by activity for a dataset._

    The query gathers contacts from six sources and tags each with a
    contributor type: sample analysts and collectors ('DataCollector'),
    publication authors and chronology contacts ('Researcher'), dataset
    submitters ('DataCurator') and dataset PIs ('ProjectLeader'). One row is
    returned per contact and contributor type.

    Args:
        con (psycopg2.connect): _A valid connection to the Neotoma Database server_
        self: _The calling object; only its `datasetid` attribute is read._

    Returns:
        list: _A list of dicts with `name`, `contributorType` and, when the
            contact has at least one non-empty external identifier,
            `nameIdentifiers`._
    """
    query = """
        WITH chronfolk AS (
            SELECT DISTINCT contactid,
                   'Researcher'::text AS contributorType
            FROM ndb.datasets AS d
            JOIN ndb.chronologies AS chron ON d.collectionunitid = chron.collectionunitid
            WHERE d.datasetid = %(datasetid)s
        ),
        collfolk AS (
            SELECT DISTINCT contactid, 'DataCollector'::text AS contributortype
            FROM ndb.datasets AS d
            JOIN ndb.collectors AS coll ON d.collectionunitid = coll.collectionunitid
            WHERE d.datasetid = %(datasetid)s
        ),
        dpi AS (
            SELECT DISTINCT contactid,
                   'ProjectLeader'::text AS contributortype
            FROM ndb.datasetpis WHERE datasetpis.datasetid = %(datasetid)s
        ),
        curator AS (
            /* In the DB stuff this should be a 'DataSteward' */
            SELECT DISTINCT contactid, 'DataCurator'::text AS contributortype
            FROM ndb.datasetsubmissions
            WHERE datasetsubmissions.datasetid = %(datasetid)s
        ),
        coauth AS (
            SELECT DISTINCT contactid,
                   'Researcher'::text AS contributortype
            FROM ndb.datasetpublications AS d
            JOIN ndb.publicationauthors AS paut ON d.publicationid = paut.publicationid
            WHERE d.datasetid = %(datasetid)s
        ),
        analyst AS (
            SELECT DISTINCT sana.contactid,
                   /* In the DB stuff this should be a 'DataAnalyst' */
                   'DataCollector'::text AS contributortype
            FROM ndb.samples AS samp
            JOIN ndb.sampleanalysts AS sana ON samp.sampleid = sana.sampleid
            WHERE samp.datasetid = %(datasetid)s
        )
        SELECT DISTINCT cts.contactname AS name,
               -- cts.address AS affiliation,
               lister.contributortype as "contributorType",
               jsonb_agg(DISTINCT
                   jsonb_build_object('nameIdentifier', exct.identifier,
                                      'nameIdentifierScheme', exdb.extdatabasename,
                                      'schemeUri', exdb.url)) AS "nameIdentifiers"
        FROM (SELECT * FROM analyst
              UNION ALL
              (SELECT * FROM coauth)
              UNION ALL
              (SELECT * FROM curator)
              UNION ALL
              (SELECT * FROM dpi)
              UNION ALL
              (SELECT * FROM collfolk)
              UNION ALL
              (SELECT * FROM chronfolk)) AS lister
        JOIN ndb.contacts AS cts ON cts.contactid = lister.contactid
        LEFT JOIN ndb.externalcontacts AS exct ON exct.contactid = cts.contactid
        LEFT JOIN ndb.externaldatabases AS exdb ON exdb.extdatabaseid = exct.extdatabaseid
        GROUP BY cts.contactid, lister.contributortype;
    """
    with con.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(query, {"datasetid": self.datasetid})
        response = cur.fetchall()

    contributors = []
    for row in response:
        contributor = dict(row)
        # The LEFT JOINs yield an entry with a null 'nameIdentifier' for
        # contacts without external identifiers. Keep only usable entries
        # instead of discarding every identifier when a single one is empty.
        identifiers = [
            entry
            for entry in (contributor.pop("nameIdentifiers", None) or [])
            if entry.get("nameIdentifier")
        ]
        if identifiers:
            contributor["nameIdentifiers"] = identifiers
        contributors.append(contributor)
    return contributors

src/neotomadoi/neo_creators.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import psycopg2
2+
import psycopg2.extras
3+
4+
5+
def neo_creators(con: psycopg2.connect, self) -> list:
    """_Obtain a list of Neotoma dataset PIs for a dataset._

    Args:
        con (psycopg2.connect): _A valid psycopg connection to the Neotoma database._
        self: _The calling object; only its `datasetid` attribute is read._

    Returns:
        list: _A list of dicts, one per dataset PI, with `name` and, when the
            contact has at least one non-empty external identifier,
            `nameIdentifiers`. When the dataset has no PIs the list holds a
            single `{"name": "None listed"}` entry._
    """

    query = """
        SELECT DISTINCT cts.contactname AS name,
               -- cts.address AS affiliation,
               jsonb_agg(DISTINCT
                   jsonb_build_object('nameIdentifier', exct.identifier,
                                      'nameIdentifierScheme', exdb.extdatabasename,
                                      'schemeUri', exdb.url)) AS "nameIdentifiers"
        FROM ndb.datasetpis AS dspi
        INNER JOIN ndb.contacts AS cts ON cts.contactid = dspi.contactid
        LEFT JOIN ndb.externalcontacts AS exct ON exct.contactid = cts.contactid
        LEFT JOIN ndb.externaldatabases AS exdb ON exdb.extdatabaseid = exct.extdatabaseid
        WHERE dspi.datasetid = %(datasetid)s
        GROUP BY cts.contactid;
    """

    with con.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(query, {"datasetid": self.datasetid})
        response = cur.fetchall()

    if not response:
        # Always return at least one creator entry, even without PIs.
        return [{"name": "None listed"}]

    creators = []
    for row in response:
        creator = dict(row)
        if creator.get("name") is None:
            creator["name"] = "None listed"
        # The LEFT JOINs yield an entry with a null 'nameIdentifier' for
        # contacts without external identifiers. Keep only usable entries
        # instead of discarding every identifier when a single one is empty.
        identifiers = [
            entry
            for entry in (creator.pop("nameIdentifiers", None) or [])
            if entry.get("nameIdentifier")
        ]
        if identifiers:
            creator["nameIdentifiers"] = identifiers
        creators.append(creator)
    return creators

src/neotomadoi/neo_dates.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import psycopg2
2+
import psycopg2.extras
3+
4+
5+
def neo_dates(con: psycopg2.connect, self) -> object:
    """_Collect the key dates recorded for a dataset._

    The query labels the earliest submission date as 'Submitted', every later
    submission date as 'Updated', and the earliest `recdatecreated` in
    `ndb.datasetdoi` as 'Issued'. Rows with a null date are excluded.

    Args:
        con (psycopg2.connect): _A valid connection to the Neotoma database._
        self: _The calling object; only its `datasetid` attribute is read._

    Returns:
        object: _A list of `{"dateType": ..., "date": "YYYY-MM-DD"}` dicts._
    """
    query = """
        WITH creation AS (
            SELECT MIN(ds.submissiondate)::date as date, 'Submitted'::text
            FROM ndb.datasetsubmissions AS ds
            WHERE ds.datasetid = %(datasetid)s
        ),
        resub AS (
            SELECT ds.submissiondate as date, 'Updated'::text
            FROM ndb.datasetsubmissions AS ds
            WHERE ds.datasetid = %(datasetid)s
            ORDER BY ds.submissiondate
            OFFSET 1
        ),
        issued AS (
            SELECT dsdoi.recdatecreated as date, 'Issued'::text
            FROM ndb.datasetdoi AS dsdoi
            WHERE dsdoi.datasetid = %(datasetid)s
            ORDER BY dsdoi.recdatecreated
            LIMIT 1
        )
        SELECT DISTINCT *
        FROM (
            (SELECT * FROM creation)
            UNION ALL
            (SELECT * FROM resub)
            UNION ALL
            (SELECT * FROM issued)) AS dates
        WHERE date is not NULL;
    """

    with con.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(query, {"datasetid": self.datasetid})
        rows = cur.fetchall()

    # The label column is selected as 'Submitted'::text etc. without an alias,
    # which is why it is looked up under the key "text".
    records = [dict(row) for row in rows]
    return [
        {
            "dateType": record.get("text"),
            "date": record.get("date").strftime("%Y-%m-%d"),
        }
        for record in records
    ]

src/neotomadoi/neo_description.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import psycopg2
2+
import psycopg2.extras
3+
4+
5+
def neo_description(con: psycopg2.connect, self) -> object:
    """_Build the abstract used as the dataset description in the DOI metadata._

    The site name and dataset type are looked up for the dataset and embedded
    in a fixed, English-language paragraph.

    Args:
        con (psycopg2.connect): _A valid connection to the Neotoma database._
        self: _The calling object; only its `datasetid` attribute is read._

    Returns:
        object: _A single-element list holding a dict with `descriptionType`,
            `description` and `lang`._
    """
    query = """
        SELECT st.sitename || ' ' || dst.datasettype || ' dataset' AS title
        FROM
            ndb.datasets AS ds
            INNER JOIN ndb.datasettypes AS dst ON dst.datasettypeid = ds.datasettypeid
            INNER JOIN ndb.collectionunits AS cu ON cu.collectionunitid = ds.collectionunitid
            INNER JOIN ndb.sites AS st ON st.siteid = cu.siteid
        WHERE ds.datasetid = %(datasetid)s;
    """

    with con.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(query, {"datasetid": self.datasetid})
        row = cur.fetchone()

    # First (only) column is the "<site name> <dataset type> dataset" title.
    title = row[0]
    sentences = (
        f"Raw data for the {title} submitted to the Neotoma Paleoecology Database. ",
        "Data is available through the landing page in JSON format. ",
        "The landing page referenced by the DOI also contains links to publications and a map-based viewer for the dataset. ",
        "The Neotoma Paleoecology Database maintains a homepage at https://www.neotomadb.org.",
    )
    abstract = "".join(sentences)
    return [{"descriptionType": "Abstract", "description": abstract, "lang": "EN"}]

src/neotomadoi/neo_identifier.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import psycopg2
2+
import psycopg2.extras
3+
4+
5+
def neo_identifier(con: psycopg2.connect, self) -> object:
    """_Look up the DOI stored for the dataset._

    Args:
        con (psycopg2.connect): _A valid connection to the Neotoma database._
        self: _The calling object; only its `datasetid` attribute is read._

    Returns:
        object: _A dict with `identifier` and `identifierType` when a row
            exists in `doi.doimeta` for the dataset, otherwise an empty dict._
    """
    query = """
        SELECT doi as identifier,
               'DOI' as "identifierType"
        FROM doi.doimeta
        WHERE datasetid = %(datasetid)s
        LIMIT 1;
    """

    with con.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        cur.execute(query, {"datasetid": self.datasetid})
        row = cur.fetchone()

    # No stored DOI yields an empty dict rather than None.
    return dict(row) if row else {}

0 commit comments

Comments
 (0)