-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest_waybackprov.py
More file actions
63 lines (47 loc) · 1.58 KB
/
test_waybackprov.py
File metadata and controls
63 lines (47 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from waybackprov import get_collection, get_crawls, get_depth, deepest_collection, cdx
def test_coll():
    """Collection metadata lookup returns the expected human-readable title."""
    metadata = get_collection("ArchiveIt-Collection-2410")
    assert metadata["title"] == "University of Maryland"
def test_get_crawls():
    """Crawls for a known-archived site carry timestamp, url, status and collections."""
    crawls = list(get_crawls("https://mith.umd.edu"))
    assert crawls
    first = crawls[0]
    # every crawl record must have a truthy value for each expected field
    for field in ("timestamp", "url", "status", "collections"):
        assert first[field]
    assert len(first["collections"]) > 0
def test_depth():
    """Known collections report their expected nesting depth."""
    expected_depths = {
        "ArchiveIt-Collection-2410": 4,
        "wikipediaoutlinks00003": 3,
    }
    for coll_id, depth in expected_depths.items():
        assert get_depth(coll_id) == depth
def test_deepest_collection():
    """deepest_collection picks the most deeply nested id from a mixed list."""
    candidates = [
        "ArchiveIt-Partner-408",
        "archiveitdigitalcollection",
        "web",
        "archiveitpartners",
        "ArchiveIt-Collection-2410",
    ]
    assert deepest_collection(candidates) == "ArchiveIt-Collection-2410"
def test_loop():
    """get_depth terminates even when collection membership is cyclic.

    Weirdly, some collections can contain themselves when there is a loop,
    e.g. coll1 ∃ coll2 and coll2 ∃ coll1 — depth computation must not
    recurse forever in that case.
    """
    assert get_depth("ArchiveIt-Partner-1140") == 3
def test_prefix():
    """Prefix queries filtered by a regex still yield crawl records with a url.

    Fix: the match pattern was a plain string containing ``\\d`` — an invalid
    escape sequence that raises SyntaxWarning on Python 3.12+ and is slated to
    become an error. A raw string keeps the pattern byte-identical.
    """
    crawls = get_crawls(
        "https://twitter.com/Guccifer_2", prefix=True, match=r"/status/\d+$"
    )
    crawl = next(crawls)
    assert crawl["url"]
def test_cdx():
    """A year-bounded CDX query returns the expected number of matching URLs.

    Fix: the match pattern was a plain string containing ``\\d`` — an invalid
    escape sequence that raises SyntaxWarning on Python 3.12+ and is slated to
    become an error. A raw string keeps the pattern byte-identical.
    """
    urls = cdx(
        "https://twitter.com/Guccifer_2",
        match=r"/status/\d+$",
        start_year=2016,
        end_year=2018,
    )
    # NOTE(review): 132 is a snapshot count from the live CDX API and may
    # drift as the Wayback Machine re-indexes — confirm before relying on it.
    assert len(list(urls)) == 132
def test_missing():
    """A URL the Wayback Machine never crawled yields no crawl records."""
    never_archived = (
        "https://twitter.com/slavresistance/status/1016697918970105857/"
    )
    assert list(get_crawls(never_archived)) == []