-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathAutoCrawlNews.py
More file actions
48 lines (42 loc) · 2.02 KB
/
AutoCrawlNews.py
File metadata and controls
48 lines (42 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import argparse
import pdb
import sys
import ConvertHtmlToText
from CrawlWebSite import BaseWebsite
def parse_arguments(argv):
    """Parse the crawler's command-line options.

    Args:
        argv: Argument strings to parse, without the program name
            (e.g. ``sys.argv[1:]``).

    Returns:
        An ``argparse.Namespace`` with the attributes ``web`` (``None`` when
        the option is omitted), ``tgt_lang`` (default ``"lo"``) and
        ``lim_sorce`` (default ``"lo"``).
    """
    parser = argparse.ArgumentParser()
    # Default to None rather than a truthy placeholder so callers can detect
    # a missing -web with a plain truthiness check.
    parser.add_argument(
        '-web',
        type=str,
        default=None,
        help='Website to crawl: vov, vietlao, vnanet, vietnamplus, qdnd, '
             'tapchicongsan, nhandan, bcc',
    )
    parser.add_argument('-tgt_lang', type=str, help='Target language.', default="lo")
    # NOTE(review): this help text looks copy-pasted and the option's intended
    # meaning is not evident from this file -- confirm and reword.
    parser.add_argument('-lim_sorce', type=str, help='Source Language.', default="lo")
    return parser.parse_args(argv)
# python AutoCrawlNews.py -web vietlao -tgt_lang lo
# python AutoCrawlNews.py -web vov -tgt_lang lo
# Crawl targets keyed by the value accepted for -web.  Each entry holds the
# positional arguments handed to BaseWebsite and the extra keyword arguments
# handed to auto_crawl_website.
# NOTE(review): the BaseWebsite arguments look like (site name, crawler class
# name, language codes) -- confirm against CrawlWebSite.BaseWebsite.
_CRAWL_TARGETS = {
    "vov": (("Vov", "VovCrawler", ["en", "ja", "km", "zh", "lo", "vi"]), {}),
    "vietlao": (("VietLao", "VietLaoVietNamCrawler", ["lo", "vi"]), {}),
    "vnanet": (("Vnanet", "VnanetCrawler", ["vi", "lo", "zh", "en", "km"]), {}),
    "vietnamplus": (
        ("VietNamPlus", "VietNamPlusCrawler", ["vi", "zh", "en"]),
        {"type": "date"},
    ),
    "qdnd": (
        ("QDND", "QDNDCrawler", ["lo", "vi", "zh", "en", "km"]),
        {"type": "title"},
    ),
    "tapchicongsan": (
        ("TapchiCongSan", "TapChiCongSanCrawler", ["lo", "vi", "zh", "en"]),
        {"type": "title"},
    ),
    "nhandan": (
        ("NhanDan", "NhanDanCrawler", ["vi", "zh", "en"]),
        {"type": "title"},
    ),
    "bcc": (("BCC", "BCCCrawler", ["vi", "zh"]), {"type": "title"}),
}

if __name__ == '__main__':
    args = parse_arguments(sys.argv[1:])
    site = args.web
    # Anything other than a non-empty string means -web was not supplied.
    if not isinstance(site, str) or not site:
        print("Hay nhap trang web")
        sys.exit()
    # Fail loudly on an unrecognised site instead of silently doing nothing.
    if site not in _CRAWL_TARGETS:
        sys.exit(
            "Unknown website '{}'. Supported: {}".format(
                site, ", ".join(_CRAWL_TARGETS)
            )
        )
    website_args, crawl_kwargs = _CRAWL_TARGETS[site]
    web = BaseWebsite(*website_args)
    web.auto_crawl_website(args.tgt_lang, **crawl_kwargs)