Skip to content

Commit 2dc05a4

Browse files
macdiceCommitfest Bot
authored andcommitted
Refactor patch/tarball detection.
It's not ideal yet, but previously we couldn't handle: https://www.postgresql.org/message-id/CAApHDvrF6DG7%3DxD8JGo2HoQKN0LRFNF0ysVt6cKSNPiqbdQOSA%40mail.gmail.com ... because it has a tarball and also some patches. The new logic is that if you have exactly one tarball we'll take it, but if you have some patches and a tarball we'll take only the patches. This also moves URL classification into separate functions, which is hopefully a bit tidier than the message regexes used before. Better idea for the future suggested in comments...
1 parent 06db508 commit 2dc05a4

1 file changed

Lines changed: 48 additions & 19 deletions

File tree

cfbot_commitfest_rpc.py

Lines changed: 48 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@ def __str__(self):
3131
)
3232

3333

34+
def url_looks_like_patch(url):
35+
return "/nocfbot" not in url and re.match(
36+
r"https://.*\.(diff|patch)(\.gz|\.bz2)?$", url
37+
)
38+
39+
40+
def url_looks_like_patch_tarball(url):
41+
return "/nocfbot" not in url and re.match(
42+
r"https://.*\.(tar|tgz|tar\.gz|tar\.bz2|zip)$", url
43+
)
44+
45+
3446
def get_latest_patches_from_thread_url(thread_url):
3547
"""Given a 'whole thread' URL from the archives, find the last message that
3648
had at least one attachment called something.patch. Return the message
@@ -41,34 +53,52 @@ def get_latest_patches_from_thread_url(thread_url):
4153
message_id = None
4254
for line in cfbot_util.slow_fetch(thread_url).splitlines():
4355
groups = re.search(
44-
'<a href="(/message-id/attachment/[^"]*\\.(diff|diff\\.gz|patch|patch\\.gz|tar\\.gz|tgz|tar\\.bz2|zip))">',
56+
'<a href="(/message-id/attachment/[^"]*)">',
4557
line,
4658
)
4759
if groups:
4860
attachment = groups.group(1)
49-
if "/nocfbot" not in attachment and not attachment.endswith(
50-
"subtrans-benchmark.tar.gz"
51-
):
52-
message_attachments.append("https://www.postgresql.org" + attachment)
61+
url = "https://www.postgres.org" + attachment
62+
if url_looks_like_patch(url) or url_looks_like_patch_tarball(url):
63+
message_attachments.append(url)
5364
selected_message_attachments = message_attachments
5465
selected_message_id = message_id
5566

56-
# groups = re.search('<a name="([^"]+)"></a>', line)
67+
# start of a new message?
5768
groups = re.search('<td><a href="/message-id/[^"]+">([^"]+)</a></td>', line)
5869
if groups:
5970
message_id = groups.group(1)
6071
message_attachments = []
61-
# if there is a tarball attachment, there must be only one attachment,
62-
# otherwise give up on this thread (we don't know how to combine patches and
63-
# tarballs)
72+
6473
if selected_message_attachments is not None:
6574
if any(
66-
x.endswith(".tgz") or x.endswith(".tar.gz") or x.endswith(".tar.bz2")
67-
for x in selected_message_attachments
75+
url_looks_like_patch_tarball(url) for url in selected_message_attachments
6876
):
69-
if len(selected_message_attachments) > 1:
77+
# there is a tarball. we don't actually know if it contains any
78+
# patches (rather than, say, benchmark results). this is stupid,
79+
# but we'll try to guess...
80+
#
81+
# XXX the basic problem here is that we can't peek into the
82+
# tarballs and see if they contain patches, which is a bit sad;
83+
# perhaps we should just take everything, and teach the patch
84+
# burner script to examine everything and fail with a special
85+
# result code for 'nothing to do here' if it can't find any
86+
# patches? the point of that would be to avoid running any code
87+
# that downloads and unpacks stuff outside the container, since we
88+
# don't really have enough information here but also don't want to
89+
# touch untrusted data here
90+
if any(url_looks_like_patch(url) for url in selected_message_attachments):
91+
# mixture of tarballs and patches, keep only the patches (not
92+
# great as it would be nice to be able to post a tarball + an
93+
# extra plain patch)
94+
selected_message_attachments = list(
95+
filter(url_looks_like_patch, selected_message_attachments)
96+
)
97+
elif len(selected_message_attachments) > 1:
98+
# tarball-only, multi-tarball messages not currently supported
7099
selected_message_id = None
71100
selected_message_attachments = None
101+
72102
# if there are multiple patch files, they had better follow the convention
73103
# of leading numbers, otherwise we don't know how to apply them in the right
74104
# order
@@ -172,10 +202,9 @@ def get_current_commitfests():
172202

173203

174204
if __name__ == "__main__":
175-
# for name, cf in get_current_commitfests().items():
176-
# if not cf:
177-
# continue
178-
for sub in get_submissions_for_commitfest(55):
179-
print(str(sub))
180-
# print get_thread_url_for_submission(19, 1787)
181-
# print(get_latest_patches_from_thread_url(get_thread_url_for_submission(37, 2901)))
205+
# test case
206+
print(
207+
get_latest_patches_from_thread_url(
208+
"https://www.postgresql.org/message-id/flat/CAApHDvrF6DG7=xD8JGo2HoQKN0LRFNF0ysVt6cKSNPiqbdQOSA@mail.gmail.com"
209+
)
210+
)

0 commit comments

Comments
 (0)