word-memorization/VocabGPT.py at main · HeMuling/word-memorization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
from openai import OpenAI
import pandas as pd
import os

def break_input(input_message:str)->list:
    '''
    Split the raw user input into its instruction part and its task part.

    args:
        - input_message: str, text typed by the user,
                        expected in the form 'instruction--task'

    return:
        - list of exactly two strings: [instruction, task]

    raises:
        - AssertionError: when splitting on '--' does not give exactly two pieces
    '''

    pieces = input_message.split('--')
    piece_count = len(pieces)

    # exactly one '--' separator is allowed
    assert piece_count == 2, "input format incorrect"

    return pieces

def break_task(input_task_str:str)->list:
    '''
    Split a task string into its task name and subtask name.

    args:
        - input_task_str: str, task in the form 'task:subtask' or just 'task'

    return:
        - task: list with the ':'-separated parts, normally
                [task_name, subtask_name].
                When only 'vocab' is given, the subtask defaults to 'exp'.
                Any other task given without a subtask comes back as a
                one-element list, so callers that unpack two values
                have to handle that case themselves.
    '''

    task = input_task_str.split(':')

    # fill in the default subtask when the user left it out
    if task == ['vocab']:
        task.append('exp')

    return task

def get_prompt(input_task_list:list, prompt_dict:dict)->str:
    '''
    Build the system prompt for one task/subtask pair.

    args:
        - input_task_list: list, [task_name, subtask_name] as produced by fun:break_task
        - prompt_dict: dict, set by users, in form:
                    {
                        'task name1':{
                            'universal': universal prompt,
                            'subtask_1':prompt1,
                            'subtask_2':prompt2,
                            ....
                        },
                        'task name2':{
                            'universal': universal prompt,
                        ...
                        }
                    }

    return:
        - sys_prompt: the task's 'universal' prompt followed by the subtask prompt

    raises:
        - AssertionError: when the task name or the subtask name is not in prompt_dict
    '''

    task_name, subtask_name = input_task_list

    assert task_name in prompt_dict, "task name not recognized"
    task_prompts = prompt_dict[task_name]
    assert subtask_name in task_prompts, "subtask name not recognized"

    # shared prefix first, then the subtask-specific part
    return task_prompts['universal'] + task_prompts[subtask_name]

def response_from_gpt(user_prompt:str, sys_prompt:str)->str:
    '''
    Send one system + user message pair to the chat completion API.

    args:
        - user_prompt: str, the user's instruction (task not included)
        - sys_prompt: str, the system prompt describing the task

    return:
        - str, message content of the first choice in the response

    A new OpenAI client is created with no arguments on every call.
    '''

    # system message first, then the user's instruction
    messages = [
        {'role': 'system', 'content': sys_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

    completion = OpenAI().chat.completions.create(
        model='gpt-4-1106-preview',
        messages=messages,
        temperature=1,
        max_tokens=4095,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    # only the first returned choice is used
    return completion.choices[0].message.content

def get_prompt_dict(input_dict:dict=None)->dict:
    '''
    Return the prompt dictionary to use: the caller's one, or the built-in default.

    args:
        - input_dict: dict, set by users, in form:
                    {
                        'task name1':{
                            'universal':universal prompt
                            'subtask_1':prompt1,
                            'subtask_2':prompt2,
                            ....
                        },
                        'task name2':{
                        ...
                        }
                    }
            if not given (None), the default dict inside this function is used

    return:
        - input_dict: the dict that was passed in (returned unchanged, even if
                      empty), or the default dict.
                      The default has a single task 'vocab' with the prompts
                      'universal' (shared prefix), 'exp' (meanings with example
                      sentences and Chinese translation), 'mem' (root breakdown,
                      derivatives, origin) and 'logic' (how the meanings relate).
    '''

    # the prompt texts below are sent to the model as-is, do not reformat them
    if input_dict is None:
        input_dict = {
            'vocab':{
                'universal':'''
                    扮演一位英语母语者，你的任务是帮我记忆英语单词，准备GRE与雅思的考试。\n
                    我会输入一个英语单词，并给你一个任务, 回答不要太长，要简洁、精炼。\n
                ''',
                'exp': '''
                    你需要做的是用英文分点列出单词的含义，这些含义应该来
                    自柯林斯字典等权威书籍与字典，对于每一个含义，给1-2个英语句子，句子需要满足写作
                    规范，按照GRE与雅思的写作，并给出中文翻译。\n
                    范例：\n
                    The word \"suppliant\" is often associated with a sense of deep humility,
                    earnestness, and a position of lesser power relative to the person being
                    addressed. It is both a reflection of the demeanor of the person requesting
                    and the nature of the petition being made.\n
                    1. **Meaning as an adjective**: Humbly pleading or beseeching.\n
                        - Sentence: The suppliant tone of the petitioner moved the council to consider the appeal.\n
                        - Chinese: 恳求者的恳切语气感动了委员会去考虑这项上诉。\n
                    2. **Meaning as a noun**: A person making a humble plea to someone in power or authority.\n
                        - Sentence: The king faced a group of suppliants begging for clemency outside the palace gates.\n
                        - Chinese: 国王面对一群在宫殿大门外乞求宽恕的恳求者。\n
                ''',
                'mem':'''
                    你需要帮助我对单词进行记忆。执行这个任务时需要你完成3件事：
                    1. 根据词根分解单词，并给出单词。2.给出单词的派生 3. 给出单词的来源，可以是历史上
                    的来源，也可以是给出是从哪个词衍生来的。当执行这个任务时，保证大部分都是中文。
                    范例：\n
                    1. **词根分解**：\"exotic\" 的词源可以追溯到希腊语的 \"exotikos,\" 表示 \"来自外部的\"。\"Exo\" 是一个希腊语前缀，意为 \"外面\"。\n
                    2. **派生词**： 从 \"exotic\" 派生出来的词有 \"exotica\" （指异国风情的事物集合），\"exoticism\" （对异国文化特征的迷恋或模仿）。\n
                    3. **词源**： \"exotic\" 来自希腊语 \"exotikos\"，这进一步源自 \"exo\"（外面）。这个词首先是形容词 \"exo\" 的派生，意为 \"来自远方
                    的\" 或 \"非本地的\"，延伸出我们今天所理解的 \"具有异国特色\" 或 \"外来稀奇\" 的含义。\n
                ''',
                'logic':'''
                    对于我所给的单词，你需要直接给出这个单词的含义，如果有多个含义则需要都给出，然后从逻辑上说明，多个含义是如何衍生出来的。答案以中文给出。\n
                    范例：\n
                    "Blithe" 这个词主要有两个含义：\n
                    1. 快乐的，无忧无虑的：指一个人非常快乐、心无烦恼，生活得很无忧无虑。例如，在一场欢乐的派对上，一个人可能会显得特别blithe。\n
                    2. 轻率的，不在乎的：指一个人表现得对别人或者情况没有太多关心，或者态度上显得漫不经心。例如，一个人对于重要决定的做出可能会被认为是blithe的，如果他没有认真考虑后果。\n
                    这两个含义之间是有一定逻辑联系的。如果一个人是无忧无虑的，不被生活的压力和困难所困扰，这种态度有时可能会导致他在某些情况下显得漫不经心或者轻率，因为他可能没有感受到需要更加谨慎或严肃对待的压力。
                    反之，一个态度轻率的人可能因为对事情不太上心，而显得过于无忧无虑，以至于忽视了一些重要的细节或对别人的感受。\n
                ''',
            },
        }

    return input_dict

def init_csv(task_name:str, cache_loc:str, prompt_dict:dict)->None:
    '''
    Create an empty cache csv file for one task.

    args:
        - task_name: str, task name
        - cache_loc: str, cache location, should be a folder
        - prompt_dict: dict, set by users

    The file is written to '<cache_loc>/<task_name>.csv' with no rows and one
    column per subtask of the task; 'universal' is the shared prompt prefix,
    not a subtask, so it gets no column.
    The cache folder is created first when it does not exist yet.
    An existing file at that path is overwritten.
    This happens when retrive finds no cache file.
    '''

    # without the folder, to_csv fails on the very first run
    if cache_loc:
        os.makedirs(cache_loc, exist_ok=True)

    path = os.path.join(cache_loc, task_name + '.csv') # cache file name: task_name.csv

    subtask_columns = [subtask for subtask in prompt_dict[task_name] if subtask != 'universal']
    pd.DataFrame(columns=subtask_columns).to_csv(path, index=True)

def examine_csv(task_name:str, cache_loc:str, prompt_dict:dict)->None:
    '''
    Make sure the task's cache csv has a column for every subtask.

    args:
        - task_name: str, task name
        - cache_loc: str, cache location, should be a folder
        - prompt_dict: dict, set by users

    Reads '<cache_loc>/<task_name>.csv', adds an empty-string column for each
    subtask of prompt_dict[task_name] (except 'universal') that the file does
    not have yet, and writes the file back. The file is only rewritten when
    at least one column was added.
    Needed when a new subtask is added after the cache file was created;
    this happens when an output is about to be cached.
    '''
    csv_path = os.path.join(cache_loc, task_name + '.csv')
    frame = pd.read_csv(csv_path, index_col=0, )
    existing = frame.columns

    # collect subtasks that have no column in the csv yet
    to_add = []
    for name in prompt_dict[task_name].keys():
        if name == 'universal':
            continue
        if name not in existing:
            to_add.append(name)

    # nothing missing: leave the file untouched
    if not to_add:
        return

    for name in to_add:
        frame[name] = ''
    frame.to_csv(csv_path, index=True)


def cache_output(instruction:str, output:str, cache_loc:str,
                 task_name:str, subtask_name:str,
                 prompt_dict:dict)->None:
    '''
    Store one generated output in the task's cache csv file.

    args:
        - instruction: instruction from user
                    e.g. a word if task=='vocab'
        - output: str, generated content from ChatGPT
        - cache_loc: str, cache location, should be a folder
        - task_name: str, task name
        - subtask_name: str, subtask name
        - prompt_dict: dict, set by users

    The value is written to row `instruction`, column `subtask_name` of
    '<cache_loc>/<task_name>.csv'; the row is added when it is not there yet.
    The cache file is created when it does not exist, the same way retrive does.
    '''

    path = os.path.join(cache_loc, task_name + '.csv') # cache file name: task_name.csv

    # generate a cache file if one doesn't exist
    if not os.path.exists(path):
        init_csv(task_name, cache_loc, prompt_dict)

    # make sure the csv has a column for every subtask
    examine_csv(task_name, cache_loc, prompt_dict)

    cache = pd.read_csv(path, index_col=0)
    # columns with only empty cells are read back as float64 (all NaN);
    # newer pandas versions warn about, and are slated to reject, storing
    # text in such a column, so hold every column as generic objects
    cache = cache.astype(object)

    cache.loc[instruction, subtask_name] = output
    cache.to_csv(path, index=True)

def retrive(instruction:str,
            cache_loc:str, task_name:str,
            subtask_name:str, prompt_dict:dict)->str or None:
    '''
    Look up a previously cached output for an instruction.

    args:
        - instruction: instruction from user
                    e.g. a word if task=='vocab'
        - cache_loc: str, cache location, should be a folder
        - task_name: str, task name
        - subtask_name: str, subtask name
        - prompt_dict: dict, set by users, used to create the cache file

    return:
        - the cached output, or None when there is nothing cached:
          the instruction has no row, the subtask has no column yet,
          or the cell is empty

    also checks if the cache file exists; if not, creates one
    '''

    path = os.path.join(cache_loc, task_name + '.csv')
    # generate a cache file if one doesn't exist
    if not os.path.exists(path):
        init_csv(task_name, cache_loc, prompt_dict)
    cache = pd.read_csv(path, index_col=0)

    # a subtask added to prompt_dict after the file was created has no column
    # until the next write, so treat it as a cache miss instead of a KeyError
    if instruction not in cache.index or subtask_name not in cache.columns:
        return None

    output = cache.loc[instruction, subtask_name]
    # empty cells are read back as NaN
    if pd.isna(output):
        return None
    return output

def main():
    '''
    Interactive loop: read a query, answer it from cache or from ChatGPT.

    1. get prompt dict & cache location (folder 'cache')
    2. get user input; '!quit' ends the loop
    3. parse the input and build the system prompt;
       on malformed input print the error and ask again
    4. check if input exists in cache
        - if not, generate output from ChatGPT, cache output
        - if yes, read output from cache
    5. print output
    '''
    prompt_dict = get_prompt_dict()
    cache_loc = os.path.join('cache')
    while True:
        input_message = input("请输入查询单词，格式为\"[instruction]--[任务类名：任务名]\"：")
        if input_message == '!quit':
            break

        try:
            instruction, task = break_input(input_message)
            # unpacking fails with ValueError when the subtask is missing
            # for a task without a default, or when there are extra ':' parts
            task_name, subtask_name = break_task(task)
            sys_prompt = get_prompt([task_name, subtask_name], prompt_dict)
        except (AssertionError, ValueError) as err:
            # a typo should not end the whole session
            print("invalid input:", err)
            continue

        output = retrive(instruction, cache_loc, task_name, subtask_name, prompt_dict)
        if output is None:
            output = response_from_gpt(instruction, sys_prompt)
            cache_output(instruction, output, cache_loc, task_name, subtask_name, prompt_dict)
        print(output)

# start the interactive loop only when this file is run as a script
if __name__ == '__main__':
    main()