AI/shuffle2_gen.py at main · Convergence-Project/AI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
import nltk
# from transformers import BertTokenizer ##버트 토크나이저 임포트
from nltk.tokenize import word_tokenize, sent_tokenize ##문장 토크나이저
# from transformers import pipeline
import random
from nltk.tokenize import word_tokenize
import numpy as np
import itertools
from googletrans import Translator
import os

translator = Translator() ## create the Google Translate client object (shared module-level instance)
## passages (body text) to use

def _choose_slots(sentence_count, answer_index, slot_count=5):
    """Pick the candidate insertion slots for one question.

    Slots are numbered so that remaining sentence ``j`` (0-based) has slot
    ``2*j + 1`` in front of it and slot ``2*j + 2`` behind it; slot ``0`` is
    an additional even-numbered slot in front of the first sentence.  The
    removed sentence originally sat at ``answer_index``, so its correct slot
    is ``2*answer_index`` or ``2*answer_index + 1`` (only ``2*sentence_count``
    when it was the last sentence).

    Args:
        sentence_count: Number of sentences left after removing the answer.
        answer_index: Original 0-based index of the removed sentence.
        slot_count: How many candidate slots to return.

    Returns:
        ``(options, answer_slot)`` where ``options`` is a sorted list of
        ``slot_count`` slots that all share the parity of ``answer_slot``,
        or ``None`` when the passage has too few slots or the answer slot
        was evicted by the 0/2 rule below.
    """
    odd_slots = list(range(1, sentence_count * 2 + 1, 2))
    even_slots = list(range(0, sentence_count * 2 + 1, 2))
    # Both pools are shuffled before the answer slot is drawn so the sequence
    # of random calls stays the same regardless of which pool ends up used.
    random.shuffle(odd_slots)
    random.shuffle(even_slots)

    if answer_index == sentence_count:
        # The removed sentence was the last one: only the trailing slot fits.
        answer_slot = sentence_count * 2
    else:
        answer_slot = random.choice((answer_index * 2, answer_index * 2 + 1))

    # Distractors are drawn only from slots with the same parity as the answer.
    pool = odd_slots if answer_slot % 2 == 1 else even_slots
    chosen = {answer_slot}
    while len(chosen) != slot_count:
        if not pool:
            # Not enough distinct slots in this passage.
            return None
        chosen.add(pool.pop())
        # Slots 0 and 2 are never offered together; slot 2 is the one dropped
        # (presumably because both refer to the first sentence -- confirm).
        if 0 in chosen and 2 in chosen:
            chosen.discard(2)

    if answer_slot not in chosen:
        # The 0/2 rule removed the correct slot, so no valid question exists.
        return None
    return sorted(chosen), answer_slot


def shuffle2_gen(num, file_path, output_path, max_chars=2500):
    """Generate 'shuffle2' (sentence-insertion) question rows and save them as CSV.

    For each of the first ``num`` rows of the input CSV, one sentence is
    removed at random, five candidate insertion slots are chosen, and a row
    is recorded holding the passage, the fixed choices '1'..'5', the 1-based
    position of the correct slot among the sorted candidates, and a Korean
    translation of the passage as commentary.

    Rows are skipped (never fatal) when the content is not a string, is longer
    than ``max_chars``, has too few sentences to yield five slots, loses its
    answer slot during slot selection, or when tokenizing/translating raises.

    NOTE(review): the passage stored in 'main' is the unmodified text
    (including the removed sentence) and the candidate slots themselves are
    not written anywhere -- confirm whether a marked-up passage was intended.

    Args:
        num: Number of leading rows of the input CSV to process.
        file_path: Path of the input CSV; it must have a 'content' column.
        output_path: Path of the output CSV.  It is created with a header if
            missing, otherwise rows are appended without a header.
        max_chars: Passages whose stripped length exceeds this are skipped.

    Returns:
        None.  The result is written to ``output_path``.
    """
    df = pd.read_csv(file_path)

    columns = ['class', 'main', 'view1', 'view2', 'view3', 'view4', 'view5',
               'answer', 'commentary']
    rows = []

    for count, content in enumerate(df['content'].iloc[:num]):
        # Empty CSV cells are read back as float NaN; they cannot be tokenized.
        if not isinstance(content, str):
            continue
        main = content.strip()
        if len(main) > max_chars:
            continue

        try:
            sentences = sent_tokenize(content)
            if not sentences:
                continue

            answer_index = random.randint(0, len(sentences) - 1)
            sentences.pop(answer_index)  # remove the sentence to be re-inserted

            picked = _choose_slots(len(sentences), answer_index)
            if picked is None:
                continue
            options, answer_slot = picked
            answer_number = options.index(answer_slot) + 1

            print(answer_slot, options)
            print(answer_index)
            print("정답: ", answer_number)

            commentary = translator.translate(main, dest='ko', src='en').text
        except Exception as exc:
            # Best effort: one bad passage or a failed translation request must
            # not abort the batch, but report it instead of hiding it.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(count, "skipped:", repr(exc))
            continue

        # Every value is computed before the row is stored, so all columns
        # always have the same length.
        rows.append({
            'class': 'shuffle2',
            'main': main,
            'view1': '1',
            'view2': '2',
            'view3': '3',
            'view4': '4',
            'view5': '5',
            'answer': str(answer_number),
            'commentary': commentary,
        })
        print(count, "번째 작업중")

    tmp = pd.DataFrame(rows, columns=columns)

    # Save: create the file with a header, or append rows to an existing file.
    if not os.path.exists(output_path):
        tmp.to_csv(output_path, index=False, mode='w', encoding='utf-8-sig')
    else:
        tmp.to_csv(output_path, index=False, mode='a', encoding='utf-8-sig', header=False)