-
Notifications
You must be signed in to change notification settings - Fork 51
Expand file tree
/
Copy pathconllu_sort_sentences_by_ids.pl
More file actions
executable file
·48 lines (46 loc) · 1.12 KB
/
conllu_sort_sentences_by_ids.pl
File metadata and controls
executable file
·48 lines (46 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env perl
# Re-orders sentences in a CoNLL-U file by their ids. Reads the entire file
# into memory, hence it may choke on large files.
# Copyright © 2018 Dan Zeman <zeman@ufal.mff.cuni.cz>
# License: GNU GPL
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Read the input sentence by sentence. A sentence is everything up to and
# including the blank line that terminates it; its id is taken from the
# "# sent_id = ..." comment line. The complete text of every sentence is kept
# in memory, indexed by the sentence id.
my %hash; # sentence id => complete text of the sentence
my @sentence; # lines of the sentence that is currently being read
my $current_sent_id = ''; # stays empty if the sentence has no sent_id comment

#------------------------------------------------------------------------------
# Stores the lines of one sentence under the given id. Dies if a sentence with
# the same id has been stored before, because one of them would be lost.
#------------------------------------------------------------------------------
sub store_sentence
{
    my ($id, $lines) = @_;
    if(exists($hash{$id}))
    {
        die("Duplicate sentence id '$id'");
    }
    $hash{$id} = join('', @{$lines});
}

while(my $line = <>)
{
    push(@sentence, $line);
    if($line =~ m/^\#\s*sent_id\s*=\s*(\S+)\s*$/)
    {
        $current_sent_id = $1;
    }
    # An empty (or whitespace-only) line terminates the current sentence.
    if($line =~ m/^\s*$/)
    {
        store_sentence($current_sent_id, \@sentence);
        splice(@sentence);
        $current_sent_id = '';
    }
}
# If the input does not end with a blank line (e.g., a truncated file or
# a missing final newline), the last sentence is still pending here. Store it
# instead of silently dropping it. Because the sentence may end up in the
# middle of the output after sorting, make sure that its last line ends with
# a newline and that the sentence is terminated by a blank line.
if(@sentence)
{
    unless($sentence[-1] =~ m/\n\z/)
    {
        $sentence[-1] .= "\n";
    }
    push(@sentence, "\n");
    store_sentence($current_sent_id, \@sentence);
    splice(@sentence);
    $current_sent_id = '';
}
###!!! The treebank that motivated this script has numeric sentence ids,
###!!! therefore the ids are compared as numbers first and as strings only
###!!! if the numeric comparison does not distinguish them. This should become
###!!! configurable because other treebanks may prefer purely lexicographic
###!!! sorting.
my @ids = sort {($a <=> $b) || ($a cmp $b)} (keys(%hash));
# Print the stored sentences in the order of their sorted ids.
print($hash{$_}) foreach(@ids);