-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
54 lines (46 loc) · 1.97 KB
/
main.py
File metadata and controls
54 lines (46 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import re
import json
import glob
import os
# Load the lookup dictionaries used by the extractors below:
#   scope_keywords      -- parsed from scope_keywords.json
#   constraint_patterns -- parsed from constraint_patterns.json
with open('scope_keywords.json', encoding='utf-8') as scope_fh:
    scope_keywords = json.load(scope_fh)

with open('constraint_patterns.json', encoding='utf-8') as constraint_fh:
    constraint_patterns = json.load(constraint_fh)
# Process a single file
_STRIP_CHARS = "、;。"  # punctuation trimmed from both ends of every regex hit


def _extract_scope(text, keywords_by_label):
    """Return the labels whose keyword list has at least one hit in *text*."""
    if pd.isna(text):
        return []
    # Cells from an all-numeric column arrive as numbers; substring tests
    # need a string.
    text = str(text)
    return [
        label
        for label, keywords in keywords_by_label.items()
        if any(kw in text for kw in keywords)
    ]


def _clean_match(match):
    """Trim punctuation from one ``findall`` hit (a string or a group tuple)."""
    if isinstance(match, tuple):
        # Patterns with 2+ capture groups make findall yield tuples; keep the
        # non-empty groups and collapse to a plain string when only one is left
        # (the usual outcome for alternation patterns such as ``(a)|(b)``).
        groups = [g.strip(_STRIP_CHARS) for g in match if g]
        return groups[0] if len(groups) == 1 else groups
    return match.strip(_STRIP_CHARS)


def _extract_constraints(text, compiled_patterns):
    """Return ``{label: hit or [hits]}`` for every compiled pattern found in *text*."""
    if pd.isna(text):
        return {}
    text = str(text)
    result = {}
    for label, regex in compiled_patterns.items():
        cleaned = [_clean_match(m) for m in regex.findall(text)]
        if cleaned:
            # A single hit is stored as a scalar, several hits as a list.
            result[label] = cleaned if len(cleaned) > 1 else cleaned[0]
    return result


def process_file(input_file, output_csv, output_json, *,
                 scope_map=None, constraint_map=None):
    """Tag one CSV with scope labels and constraint hits; write CSV and JSON.

    Only the first three columns of the input are used; they are renamed to
    "产品名称", "投资范围" and "投资限制". Two derived columns are appended:
    "整理后的投资范围" (list of matching labels) and "整理后的投资限制"
    (dict of label -> regex hit(s)).

    Args:
        input_file: Path of the UTF-8 CSV to read.
        output_csv: Path of the CSV to write (UTF-8 with BOM), holding the
            three source columns plus the two derived columns.
        output_json: Path of the JSON file to write, holding the product name
            and the two derived columns as a list of records.
        scope_map: Optional ``{label: [keyword, ...]}`` mapping. Defaults to
            the module-level ``scope_keywords``.
        constraint_map: Optional ``{label: regex}`` mapping. Defaults to the
            module-level ``constraint_patterns``.

    Raises:
        ValueError: If the input has fewer than three columns.
    """
    if scope_map is None:
        scope_map = scope_keywords
    if constraint_map is None:
        constraint_map = constraint_patterns

    # Compile once per file instead of once per cell.
    compiled = {
        label: re.compile(pattern, flags=re.IGNORECASE)
        for label, pattern in constraint_map.items()
    }

    df = pd.read_csv(input_file, encoding='utf-8')
    if df.shape[1] < 3:
        raise ValueError(
            f"{input_file}: expected at least 3 columns, found {df.shape[1]}"
        )
    # .copy() detaches the slice so adding columns below cannot trigger
    # SettingWithCopyWarning.
    df = df.iloc[:, :3].copy()
    df.columns = ["产品名称", "投资范围", "投资限制"]

    df["整理后的投资范围"] = df["投资范围"].apply(
        lambda cell: _extract_scope(cell, scope_map)
    )
    df["整理后的投资限制"] = df["投资限制"].apply(
        lambda cell: _extract_constraints(cell, compiled)
    )

    df.to_csv(output_csv, index=False, encoding='utf-8-sig')

    json_data = df[["产品名称", "整理后的投资范围", "整理后的投资限制"]].to_dict(orient="records")
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

    print(f"已处理: {input_file} -> {output_csv}, {output_json}")
# Batch-process every data/input*.csv file.
# The glob uses `*` rather than `?` (which matches exactly one character), so
# multi-character suffixes such as input10.csv are no longer skipped. Each
# output name reuses its input's own suffix, so inputN.csv always produces
# outputN.csv / outputN.json even when the numbering has gaps.
input_files = sorted(glob.glob('data/input*.csv'))
for input_file in input_files:
    stem = os.path.splitext(os.path.basename(input_file))[0]
    suffix = stem[len('input'):]
    output_csv = f'data/output{suffix}.csv'
    output_json = f'data/output{suffix}.json'
    process_file(input_file, output_csv, output_json)