-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
78 lines (68 loc) · 2.15 KB
/
main.py
File metadata and controls
78 lines (68 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import functools
import re

import argostranslate.package
import argostranslate.translate
import tiktoken
from fastapi import FastAPI, Form
# Application instance; the /classify route below is registered on it
# through the @app.post decorator.
app = FastAPI()
# E-mail matcher. Raw string so that ``\w`` / ``\.`` are regex escapes rather
# than invalid Python string escapes; case-insensitive so that addresses with
# an upper-case TLD (``a@b.COM``) are anonymized as well.
_EMAIL_PATTERN = re.compile(
    r"(([\w-]+(?:\.[\w-]+)*)@((?:[\w-]+\.)*\w[\w-]{0,66})\.([a-z]{2,6}(?:\.[a-z]{2})?))",
    re.IGNORECASE,
)

# Maximum number of tokens kept from the input text.
# NOTE(review): presumably a 2048-token context minus room for the prompt
# separator and the single completion token -- confirm the 9 and the 1.
_MAX_PROMPT_TOKENS = 2048 - 9 - 1

# Marker appended to every prompt after truncation.
_PROMPT_SEPARATOR = "\n\n###\n\n"


@functools.lru_cache(maxsize=None)
def _ensure_translation_package(from_code: str, to_code: str) -> None:
    """Install the Argos Translate package for a language pair once per process.

    The index refresh, download and installation are only performed on the
    first successful call for a given pair; later calls are no-ops. A failed
    attempt is not cached, so it is retried on the next call.

    :param from_code: Source language code.
    :param to_code: Target language code.
    :raises RuntimeError: If the package index has no package for the pair.
    """
    argostranslate.package.update_package_index()
    package_to_install = next(
        (
            package
            for package in argostranslate.package.get_available_packages()
            if package.from_code == from_code and package.to_code == to_code
        ),
        None,
    )
    if package_to_install is None:
        raise RuntimeError(
            f"No Argos Translate package available for {from_code} -> {to_code}"
        )
    argostranslate.package.install_from_path(package_to_install.download())


@functools.lru_cache(maxsize=None)
def _load_ner_model():
    """Load the spaCy ``en_core_web_sm`` pipeline once and reuse it."""
    # Imported lazily, as in the original, so the model package is only
    # required when preprocessing actually runs.
    import en_core_web_sm

    return en_core_web_sm.load()


def preprocess_input(char: str) -> str:
    """Clean, translate and anonymize text before it is classified.

    The text is processed in this order:

    1. newlines are replaced by spaces and surrounding whitespace is removed;
    2. the text is translated from ``"de"`` to ``"en"`` with Argos Translate;
    3. e-mail addresses are replaced by ``<EMAIL>``;
    4. spaCy ``PERSON`` entities are replaced by ``<PERSON>``;
    5. the text is cut to ``_MAX_PROMPT_TOKENS`` tokens of the ``ada``
       encoding and the prompt separator is appended.

    :param char: Raw input text.
    :return: The prepared prompt, ending in the separator ``"\\n\\n###\\n\\n"``.
    :raises RuntimeError: If no translation package for the language pair is
        available in the Argos Translate package index.
    """
    # clean input
    text = char.replace("\n", " ").strip()

    # translate
    from_code = "de"
    to_code = "en"
    _ensure_translation_package(from_code, to_code)
    text = argostranslate.translate.translate(text, from_code, to_code)

    # anonymize email address
    text = _EMAIL_PATTERN.sub("<EMAIL>", text)

    # anonymize person names
    doc = _load_ner_model()(text)
    person_names = {ent.text for ent in doc.ents if ent.label_ == "PERSON"}
    # Longest names first: replacing "John" before "John Smith" would leave
    # "Smith" behind, because the longer name would no longer be found.
    for name in sorted(person_names, key=lambda n: (-len(n), n)):
        text = text.replace(name, "<PERSON>")

    # limit length
    encoder = tiktoken.encoding_for_model("ada")
    # disallowed_special=() makes strings such as "<|endoftext|>" in user
    # input encode as ordinary text instead of raising ValueError.
    tokens = encoder.encode(text, disallowed_special=())
    text = encoder.decode(tokens[:_MAX_PROMPT_TOKENS])
    return text + _PROMPT_SEPARATOR
# Identifier of the fine-tune job whose resulting model does the classification.
_FINE_TUNE_ID = "ft-oGEtDF48FowWEF0mUhaDskGh"


@functools.lru_cache(maxsize=None)
def _fine_tuned_model_name(fine_tune_id: str) -> str:
    """Resolve the model name produced by a fine-tune job, once per process.

    The lookup costs one API round trip, so the result is memoised instead of
    being fetched again for every classification request.

    :param fine_tune_id: Identifier of the fine-tune job.
    :return: Name of the fine-tuned model.
    :raises RuntimeError: If the job reports no fine-tuned model.
    """
    import openai

    model_name = openai.FineTune.retrieve(fine_tune_id).fine_tuned_model
    if not model_name:
        # NOTE(review): presumably unset while the job has not finished --
        # confirm. Raising (rather than returning) keeps lru_cache from
        # storing an unusable value, so the lookup is retried next time.
        raise RuntimeError(
            f"Fine-tune job {fine_tune_id} has no fine-tuned model available"
        )
    return model_name


def gpt_3_classification(char: str) -> str:
    """Classify a prepared prompt with the fine-tuned GPT-3 model.

    The API key is read from the ``OPENAI_API_KEY`` environment variable on
    every call. Exactly one token is requested at temperature 0, and the text
    of the first choice is returned unmodified.

    :param char: Prompt text to classify.
    :return: Text of the first completion choice.
    :raises RuntimeError: If the fine-tune job reports no fine-tuned model.
    """
    # Imported lazily, as in the original, so the module can be imported
    # without the openai package being loaded.
    import os

    import openai

    # Must be set before any API call, including the model-name lookup below.
    openai.api_key = os.getenv("OPENAI_API_KEY")
    response = openai.Completion.create(
        model=_fine_tuned_model_name(_FINE_TUNE_ID),
        prompt=char,
        temperature=0,
        max_tokens=1,
        # Log-probabilities are requested but not read by this function.
        logprobs=2,
    )
    return response["choices"][0]["text"]
@app.post("/classify")
def process_strings(
    char: str = Form(...),
    kategorie: str = Form(...),
    unterkategorie: str = Form(...),
):
    # POST /classify: preprocess the submitted text, classify it and return
    # the label with surrounding whitespace removed.
    #
    # Documented with comments rather than a docstring on purpose: FastAPI
    # publishes an endpoint docstring as the operation description, which
    # would change the generated API schema.
    #
    # `kategorie` and `unterkategorie` are required form fields but are not
    # used in the body; they stay in the signature so the request contract
    # is unchanged.
    prompt = preprocess_input(char)
    label = gpt_3_classification(prompt)
    return label.strip()