-
Notifications
You must be signed in to change notification settings - Fork 65
Expand file tree
/
Copy pathTextRankExtractor.cs
More file actions
107 lines (89 loc) · 3.24 KB
/
TextRankExtractor.cs
File metadata and controls
107 lines (89 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
using System;
using System.Collections.Generic;
using System.Linq;
using JiebaNet.Segmenter;
using JiebaNet.Segmenter.Common;
using JiebaNet.Segmenter.PosSeg;
namespace JiebaNet.Analyser
{
    /// <summary>
    /// Keyword extractor that builds a word co-occurrence graph from the
    /// part-of-speech-tagged tokens of a text and scores each word with
    /// <c>UndirectWeightedGraph.Rank()</c>.
    /// </summary>
    public class TextRankExtractor : KeywordExtractor
    {
        /// <summary>Number of keywords returned when the caller passes a non-positive count.</summary>
        private const int DefaultCount = 20;

        /// <summary>
        /// Part-of-speech flags accepted when the caller does not supply any.
        /// Only membership is ever tested, so a set is used.
        /// </summary>
        private static readonly ICollection<string> DefaultPosFilter = new HashSet<string>
        {
            "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "v", "vd", "vg", "vi", "vn", "vq"
        };

        private JiebaSegmenter Segmenter { get; set; }
        private PosSegmenter PosSegmenter { get; set; }

        /// <summary>
        /// Size of the co-occurrence window. Each accepted token is paired with the
        /// accepted tokens among the <c>Span - 1</c> tokens that follow it (the window
        /// is measured over all tokens, before filtering). Set to 5 by the constructor.
        /// </summary>
        public int Span { get; set; }

        /// <summary>
        /// Decides whether a tagged token may become a node of the co-occurrence graph.
        /// </summary>
        /// <param name="allowPos">Accepted part-of-speech flags.</param>
        /// <param name="wp">The tagged token to test.</param>
        /// <returns>
        /// <c>true</c> when the token's flag is in <paramref name="allowPos"/>, its trimmed
        /// word has at least two characters, and its lower-cased word is not a stop word.
        /// </returns>
        public bool PairFilter(IEnumerable<string> allowPos, Pair wp)
        {
            // Lower-case with the invariant culture so the stop-word lookup does not depend on
            // the current thread culture (e.g. "I" lower-cases to a dotless i under tr-TR).
            return allowPos.Contains(wp.Flag)
                   && wp.Word.Trim().Length >= 2
                   && !StopWords.Contains(wp.Word.ToLowerInvariant());
        }

        /// <summary>
        /// Creates an extractor with a window size of 5.
        /// </summary>
        /// <param name="stopWords">
        /// Stop words handed to <c>SetStopWords</c>; if the resulting stop-word set is empty,
        /// <c>DefaultStopWords</c> is merged in.
        /// </param>
        public TextRankExtractor(ISet<string> stopWords)
        {
            Span = 5;

            Segmenter = new JiebaSegmenter();
            PosSegmenter = new PosSegmenter(Segmenter);

            SetStopWords(stopWords);
            if (StopWords.IsEmpty())
            {
                StopWords.UnionWith(DefaultStopWords);
            }
        }

        /// <summary>
        /// Returns the highest-ranked words of <paramref name="text"/>, best first.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <param name="count">Maximum number of words to return; values &lt;= 0 fall back to 20.</param>
        /// <param name="allowPos">Accepted part-of-speech flags; null or empty selects the built-in default list.</param>
        /// <returns>A lazily evaluated sequence of at most <paramref name="count"/> words.</returns>
        public override IEnumerable<string> ExtractTags(string text, int count = 20,
            IEnumerable<string> allowPos = null)
        {
            return TopRanked(text, count, allowPos).Select(p => p.Key);
        }

        /// <summary>
        /// Returns the highest-ranked words of <paramref name="text"/> together with their scores, best first.
        /// </summary>
        /// <param name="text">Text to analyse.</param>
        /// <param name="count">Maximum number of entries to return; values &lt;= 0 fall back to 20.</param>
        /// <param name="allowPos">Accepted part-of-speech flags; null or empty selects the built-in default list.</param>
        /// <returns>A lazily evaluated sequence of at most <paramref name="count"/> word/weight pairs.</returns>
        public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20,
            IEnumerable<string> allowPos = null)
        {
            return TopRanked(text, count, allowPos).Select(p => new WordWeightPair
            {
                Word = p.Key, Weight = p.Value
            });
        }

        #region Private Helpers

        /// <summary>
        /// Ranks the text and returns the top entries in descending score order.
        /// Shared by both public extraction methods so the count fallback lives in one place.
        /// </summary>
        private IEnumerable<KeyValuePair<string, double>> TopRanked(string text, int count,
            IEnumerable<string> allowPos)
        {
            var rank = ExtractTagRank(text, allowPos);
            if (count <= 0)
            {
                count = DefaultCount;
            }

            // OrderByDescending is a stable sort, so equally scored words keep the order
            // in which the rank dictionary enumerates them.
            return rank.OrderByDescending(p => p.Value).Take(count);
        }

        /// <summary>
        /// Resolves the caller-supplied POS filter into a collection that can be probed repeatedly.
        /// </summary>
        private static ICollection<string> ResolvePosFilter(IEnumerable<string> allowPos)
        {
            if (allowPos == null)
            {
                return DefaultPosFilter;
            }

            // Keep an existing collection as-is (its own Contains/comparer semantics are preserved);
            // materialize anything else once so a lazy sequence is not re-enumerated per token.
            var filter = allowPos as ICollection<string> ?? new HashSet<string>(allowPos);
            return filter.Count == 0 ? DefaultPosFilter : filter;
        }

        /// <summary>
        /// Builds the co-occurrence graph for <paramref name="text"/> and returns the score of every word in it.
        /// </summary>
        private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
        {
            var posFilter = ResolvePosFilter(allowPos);

            var words = PosSegmenter.Cut(text).ToList();

            // Evaluate the filter once per token instead of once per (token, window) visit.
            var accepted = words.Select(w => PairFilter(posFilter, w)).ToList();

            // Keyed by the (first, second) word pair. A structured key avoids the ambiguity of
            // joining the two words with a separator character that a word might itself contain.
            var cooccurrences = new Dictionary<Tuple<string, string>, int>();

            for (var i = 0; i < words.Count; i++)
            {
                if (!accepted[i])
                {
                    continue;
                }

                // Compare against the remaining length so a very large Span cannot overflow i + Span.
                var windowEnd = Span < words.Count - i ? i + Span : words.Count;
                for (var j = i + 1; j < windowEnd; j++)
                {
                    if (!accepted[j])
                    {
                        continue;
                    }

                    var key = Tuple.Create(words[i].Word, words[j].Word);
                    int seen;
                    cooccurrences.TryGetValue(key, out seen); // seen stays 0 for a new pair
                    cooccurrences[key] = seen + 1;
                }
            }

            var graph = new UndirectWeightedGraph();
            foreach (var entry in cooccurrences)
            {
                graph.AddEdge(entry.Key.Item1, entry.Key.Item2, entry.Value);
            }

            return graph.Rank();
        }

        #endregion
    }
}