diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..19ac515 --- /dev/null +++ b/.env.example @@ -0,0 +1,43 @@ +# MySQL +DB_HOST= +DB_PORT=3306 +DB_NAME= +DB_USER= +DB_PASSWORD= + +# MongoDB +MONGO_HOST= +MONGO_PORT=27017 +MONGO_DB= +MONGO_USER= +MONGO_PASSWORD= + +# Redis +REDIS_HOST= +REDIS_PORT=0 +REDIS_DB= +REDIS_PASSWORD= + +# Koreatech Portal +PORTAL_ID= +PORTAL_PW= +PORTAL_IP= + +# Gmail +GMAIL_ID= +GMAIL_PW= + +# Slack +SLACK_WEBHOOK_URL= + +# AWS S3 +S3_ACCESS_KEY_ID= +S3_SECRET_ACCESS_KEY= +S3_BUCKET= +S3_UPLOAD_DOMAIN= + +# Batch API +BATCH_EMAIL= +BATCH_PASSWORD= +BATCH_TOKEN_URL= +BATCH_NOTIFICATION_API_URL= diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml new file mode 100644 index 0000000..21b0162 --- /dev/null +++ b/.github/workflows/deploy-dev.yml @@ -0,0 +1,83 @@ +name: KOIN_BATCH CD (develop) + +on: + push: + branches: + - develop + +jobs: + deploy: + runs-on: ubuntu-latest + environment: development + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Record start time + run: echo "START_TIME=$(date +%s)" >> $GITHUB_ENV + + - name: Notify Slack - Deploy Start + env: + ACTIONS_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + COMMIT_MSG=$(git log -1 --pretty=%s HEAD) + curl -X POST ${{ secrets.SLACK_DEPLOY_WEBHOOK_URL }} \ + -H 'Content-Type: application/json' \ + -d "{ + \"text\": \":rocket: *[Develop] KOIN_BATCH 배포 시작*\n• Repo: ${{ github.repository }}\n• Branch: develop\n• Author: @${{ github.actor }}\n• Commit: ${COMMIT_MSG}\n• <${ACTIONS_URL}|Actions 보기>\" + }" + + - name: Create archive + run: tar -cvzf batch.tar.gz --exclude='.git' --exclude='.github' . + + - name: Transfer archive to server + uses: appleboy/scp-action@v0.1.7 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + key: ${{ secrets.SSH_KEY }} + port: ${{ secrets.SSH_PORT }} + source: batch.tar.gz + target: ${{ secrets.DEPLOY_PATH }} + + - name: Execute deploy commands + uses: appleboy/ssh-action@v1 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + key: ${{ secrets.SSH_KEY }} + port: ${{ secrets.SSH_PORT }} + script: | + source ~/.profile + cd ${{ secrets.DEPLOY_PATH }} + tar -xvzf batch.tar.gz + rm batch.tar.gz + chmod +x run.sh + pip3 install -r requirements.txt + + cp -n crawling/config.example.py crawling/config.py + cp -n crawling/koreatech_article/config.example.py crawling/koreatech_article/config.py + cp -n crawling/koreatech_portal/config.example.py crawling/koreatech_portal/config.py + cp -n crawling/city_bus/config.example.py crawling/city_bus/config.py + + - name: Notify Slack - Deploy Result + if: always() + env: + ACTIONS_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + COMMIT_MSG=$(git log -1 --pretty=%s HEAD) + DURATION_SEC=$(( $(date +%s) - START_TIME )) + DURATION="${DURATION_SEC}초 (약 $(( DURATION_SEC / 60 ))분)" + if [ "${{ job.status }}" = "success" ]; then + ICON=":white_check_mark:" + STATUS="배포 성공" + else + ICON=":x:" + STATUS="배포 실패" + fi + curl -X POST ${{ secrets.SLACK_DEPLOY_WEBHOOK_URL }} \ + -H 'Content-Type: application/json' \ + -d "{ + \"text\": \"${ICON} *[Develop] KOIN_BATCH ${STATUS}*\n• Repo: ${{ github.repository }}\n• Branch: develop\n• Author: @${{ github.actor }}\n• Commit: ${COMMIT_MSG}\n• Duration: ${DURATION}\n• <${ACTIONS_URL}|Actions 보기>\" + }" diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml new file mode 100644 index 0000000..308efca --- /dev/null +++ b/.github/workflows/deploy-prod.yml @@ -0,0 +1,81 @@ +name: KOIN_BATCH CD (production) + +on: + workflow_dispatch: + +jobs: + deploy: + runs-on: ubuntu-latest + environment: production + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Record start time + run: echo "START_TIME=$(date +%s)" >> $GITHUB_ENV + + - name: Notify Slack - Deploy Start + env: + ACTIONS_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + COMMIT_MSG=$(git log -1 --pretty=%s HEAD) + curl -X POST ${{ secrets.SLACK_DEPLOY_WEBHOOK_URL }} \ + -H 'Content-Type: application/json' \ + -d "{ + \"text\": \":rocket: *[Production] KOIN_BATCH 배포 시작*\n• Repo: ${{ github.repository }}\n• Branch: master\n• Author: @${{ github.actor }}\n• Commit: ${COMMIT_MSG}\n• <${ACTIONS_URL}|Actions 보기>\" + }" + + - name: Create archive + run: tar -cvzf batch.tar.gz --exclude='.git' --exclude='.github' . + + - name: Transfer archive to server + uses: appleboy/scp-action@v0.1.7 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + key: ${{ secrets.SSH_KEY }} + port: ${{ secrets.SSH_PORT }} + source: batch.tar.gz + target: ${{ secrets.DEPLOY_PATH }} + + - name: Execute deploy commands + uses: appleboy/ssh-action@v1 + with: + host: ${{ secrets.SSH_HOST }} + username: ${{ secrets.SSH_USER }} + key: ${{ secrets.SSH_KEY }} + port: ${{ secrets.SSH_PORT }} + script: | + source ~/.profile + cd ${{ secrets.DEPLOY_PATH }} + tar -xvzf batch.tar.gz + rm batch.tar.gz + chmod +x run.sh + pip3 install -r requirements.txt + + cp -n crawling/config.example.py crawling/config.py + cp -n crawling/koreatech_article/config.example.py crawling/koreatech_article/config.py + cp -n crawling/koreatech_portal/config.example.py crawling/koreatech_portal/config.py + cp -n crawling/city_bus/config.example.py crawling/city_bus/config.py + + - name: Notify Slack - Deploy Result + if: always() + env: + ACTIONS_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + COMMIT_MSG=$(git log -1 --pretty=%s HEAD) + DURATION_SEC=$(( $(date +%s) - START_TIME )) + DURATION="${DURATION_SEC}초 (약 $(( DURATION_SEC / 60 ))분)" + if [ "${{ job.status }}" = "success" ]; then + ICON=":white_check_mark:" + STATUS="배포 성공" + else + ICON=":x:" + STATUS="배포 실패" + fi + curl -X POST ${{ secrets.SLACK_DEPLOY_WEBHOOK_URL }} \ + -H 'Content-Type: application/json' \ + -d "{ + \"text\": \"${ICON} *[Production] KOIN_BATCH ${STATUS}*\n• Repo: ${{ github.repository }}\n• Branch: master\n• Author: @${{ github.actor }}\n• Commit: ${COMMIT_MSG}\n• Duration: ${DURATION}\n• <${ACTIONS_URL}|Actions 보기>\" + }" diff --git a/.gitignore b/.gitignore index 798949d..d25c686 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.env +!.env.example .vscode crawling/__pycache__ crawling/config.py diff --git a/crawling/city_bus/config.example.py b/crawling/city_bus/config.example.py index d489aa3..f7dea69 100644 --- a/crawling/city_bus/config.example.py +++ b/crawling/city_bus/config.example.py @@ -1,7 +1,11 @@ -MONGO_CONFIG = { - 'host': '', - 'port': '', - 'db': '', - 'user': '', - 'password': '' -} +import importlib.util +import os + +_parent_config = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'config.py') +_spec = importlib.util.spec_from_file_location('_central_config', _parent_config) +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) + +for _name in dir(_mod): + if not _name.startswith('_'): + globals()[_name] = getattr(_mod, _name) diff --git a/crawling/config.example.py b/crawling/config.example.py index 85db0ef..f586094 100644 --- a/crawling/config.example.py +++ b/crawling/config.example.py @@ -1,19 +1,60 @@ +import os +from pathlib import Path + +from dotenv import load_dotenv + +load_dotenv(Path(__file__).resolve().parent.parent / '.env') + DATABASE_CONFIG = { - 'host': '', - 'db': '', - 'user': '', - 'password': '', - 'port': 3306 + 'host': os.getenv('DB_HOST', ''), + 'port': int(os.getenv('DB_PORT', '3306')), + 'db': os.getenv('DB_NAME', ''), + 'user': os.getenv('DB_USER', ''), + 'password': os.getenv('DB_PASSWORD', ''), +} + +MYSQL_CONFIG = DATABASE_CONFIG + +MONGO_CONFIG = { + 'host': os.getenv('MONGO_HOST', ''), + 'port': int(os.getenv('MONGO_PORT', '27017')), + 'db': os.getenv('MONGO_DB', ''), + 'user': os.getenv('MONGO_USER', ''), + 'password': os.getenv('MONGO_PASSWORD', ''), +} + +REDIS_CONFIG = { + 'host': os.getenv('REDIS_HOST', ''), + 'port': int(os.getenv('REDIS_PORT', '0')), + 'db': os.getenv('REDIS_DB', ''), + 'password': os.getenv('REDIS_PASSWORD', ''), +} + +PORTAL_CONFIG = { + 'id': os.getenv('PORTAL_ID', ''), + 'pw': os.getenv('PORTAL_PW', ''), + 'ip': os.getenv('PORTAL_IP', ''), +} + +GMAIL_CONFIG = { + 'id': os.getenv('GMAIL_ID', ''), + 'pw': os.getenv('GMAIL_PW', ''), } SLACK_CONFIG = { - 'url': '' + 'url': os.getenv('SLACK_WEBHOOK_URL', ''), } -MONGO_CONFIG = { - 'host': '', - 'port': '', - 'db': '', - 'user': '', - 'password': '' +S3_CONFIG = { + 'aws_access_key_id': os.getenv('S3_ACCESS_KEY_ID', ''), + 'aws_secret_access_key': os.getenv('S3_SECRET_ACCESS_KEY', ''), + 'bucket': os.getenv('S3_BUCKET', ''), + 'upload_domain': os.getenv('S3_UPLOAD_DOMAIN', ''), +} + +BATCH_CONFIG = { + 'email': os.getenv('BATCH_EMAIL', ''), + 'password': os.getenv('BATCH_PASSWORD', ''), + 'token_url': os.getenv('BATCH_TOKEN_URL', ''), + 'notification_api_url': os.getenv('BATCH_NOTIFICATION_API_URL', ''), } diff --git a/crawling/dcinside_gallery.py b/crawling/dcinside_gallery.py deleted file mode 100644 index 59f1b8d..0000000 --- a/crawling/dcinside_gallery.py +++ /dev/null @@ -1,131 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import re -from urllib.parse import urlparse, parse_qs -import pymysql -import json -import config - -def connect_db(): - conn = pymysql.connect(host=config.DATABASE_CONFIG['host'], - port=config.DATABASE_CONFIG['port'], - user=config.DATABASE_CONFIG['user'], - password=config.DATABASE_CONFIG['password'], - db=config.DATABASE_CONFIG['db'], - charset='utf8') - return conn - -noticeIds = { - "18":"CA001" -} - -tags = { - "CA001": "디씨크롤링" -} - -def crawling(noticeId, ls=10): - nas = [] - tag = noticeIds[noticeId] - boardId = getBoardId(tag) - - host = "https://gall.dcinside.com" - - url = host + "/mgallery/board/lists/?id=koreatech&page=1" - html = requests.get(url) - soup = BeautifulSoup(html.text, "html.parser") - trs = soup.select('#container > section.left_content > article:nth-child(3) > div.gall_listwrap.list > table > tbody > tr') - - for tr in trs: - td = tr.select('td') - # author - author = td[2].text.split('(')[0].lstrip('\n') - # title - title = td[1].text.split('\n')[1] - # permalink - permalink = host+td[1].find('a').get('href') - parsed_url = urlparse(permalink) - qs = parse_qs(parsed_url.query) - articleNum = qs.get('no')[0] - - na = DcArticle(boardId, title, author, articleNum, permalink) - setContent(na) - - nas.append(na) - - print('updating... %s %s' % (tag, str(articleNum))) - - updateDB(nas) - - pass - -def setContent(na): - html = requests.get(na.permalink) - soup = BeautifulSoup(html.text, "html.parser") - - content = soup.find('div', {'style':'overflow:hidden;'}) - content = str(content).replace('src="//', 'src="https://') - content = str(content).replace('href="//', 'href="https://') - content = re.sub("()", "", str(content)) - - registered_at = soup.find('span', {'class':'gall_date'}).get('title') - - na.content = content - na.registered_at = registered_at - pass - -def getBoardId(tag): - sql = "SELECT id FROM koin.boards WHERE tag = '%s'" - cur = connection.cursor() - cur.execute(sql % tag) - rows = cur.fetchall() - return rows[0][0] # db에 있는 boards의 id - -def updateDB(nas): - cur = connection.cursor() - - #추후 디씨 크롤링 등록시 sql문 테이블 변경 필요 - for na in nas: - na.content = na.content.replace("'","""''""") #sql문에서 작은따옴표 이스케이프 처리 - try: - sql = "INSERT INTO koin.notice_articles(board_id, title, content, author, hit, is_deleted, article_num, permalink, has_notice, registered_at) \ - VALUES (%d, '%s', '%s', '%s', %d, %d, %d, '%s', %d, '%s') \ - ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), board_id = %d, article_num = %d" - - cur.execute(sql % (na.board_id, na.title, na.content, na.author, na.hit, na.is_deleted, int(na.article_num), na.permalink, na.has_notice, na.registered_at, na.board_id, int(na.article_num))) - - newNoticeId = cur.lastrowid - - meta = json.dumps({"registered_at": na.registered_at, "permalink": na.permalink}) - - sql = "INSERT INTO koin.articles(board_id, title, nickname, content, user_id, ip, meta, is_notice, created_at, notice_article_id) \ - VALUES (%d, '%s', '%s', '%s', %d, '%s', '%s', %d, '%s', %d) \ - ON DUPLICATE KEY UPDATE board_id = %d, notice_article_id = %d" - - cur.execute(sql % (na.board_id, na.title, na.author, na.content, 0, "127.0.0.1", meta, 1, na.registered_at, newNoticeId, na.board_id, newNoticeId)) - connection.commit() - - except Exception as error: - connection.rollback() - print(error) - -class DcArticle: - def __init__(self, boardId, title, author, articleNum, permalink): - self.board_id = boardId - self.title = title - self.content = None - self.author = author - self.hit = 0 - self.is_deleted = 0 - self.has_notice = 0 - self.article_num = articleNum - self.permalink = permalink - self.registered_at = None - pass - - -if __name__ == "__main__": - # execute only if run as a script - connection = connect_db() - for noticeId in noticeIds.keys(): - crawling(noticeId) - connection.close() \ No newline at end of file diff --git a/crawling/get_holiday.py b/crawling/get_holiday.py deleted file mode 100644 index f686eab..0000000 --- a/crawling/get_holiday.py +++ /dev/null @@ -1,71 +0,0 @@ -from bs4 import BeautifulSoup -import requests -from datetime import datetime -import urllib3 -import pymysql -import config - - -def connect_db(): - urllib3.disable_warnings() - conn = pymysql.connect(host=config.DATABASE_CONFIG['host'], - port=config.DATABASE_CONFIG['port'], - user=config.DATABASE_CONFIG['user'], - password=config.DATABASE_CONFIG['password'], - db=config.DATABASE_CONFIG['db'], - charset='utf8') - return conn - - -def crawling(): - now = datetime.now() - authorize_key = 'tooGWOzbehkPmBairI8NF5qHCgPMkE7cFrHNNKRiqLBeC4Pyy7paCQbEeV0Xgt2vBp2YUWGlSxpHuc6vkcAlIQ%3D%3D' - - for i in range(1, 13): - url = "http://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?serviceKey=%s&solYear=%d" \ - "&solMonth=%02d" % (authorize_key, now.year, i) - request = requests.get(url) - request.encoding = 'UTF-8' - - soup = BeautifulSoup(request.content, features="html.parser") - - header = soup.find('header') - if header is None: # 에러 시 넘김 - print("%d월 에러 발생: None" % i) - continue - - resultCode = header.find('resultcode') - resultMessage = header.resultmsg - - if resultCode.text != '00': # 정상 코드가 아니면 넘김 - print("%d월 에러 발생: %s" % (i, resultMessage.text)) - continue - - body = soup.body - items = body.findAll('item') - for item in items: - name = item.datename.text - date = item.locdate.text - # print(name, date) - updateDB(name, date) - pass - - -def updateDB(name, date): - cur = connection.cursor() - try: - sql = "INSERT INTO koin.holidays(NAME, DATE) VALUES ('%s', '%s')" - print(sql % (name, date)) - - cur.execute(sql % (name, date)) - connection.commit() - - except Exception as error: - connection.rollback() - print(error) - - -if __name__ == "__main__": - connection = connect_db() - crawling() - connection.close() diff --git a/crawling/graduation_credit_calculator/culture_hrd_msc.py b/crawling/graduation_credit_calculator/culture_hrd_msc.py deleted file mode 100644 index f604d14..0000000 --- a/crawling/graduation_credit_calculator/culture_hrd_msc.py +++ /dev/null @@ -1,234 +0,0 @@ -import pdfplumber -import pandas as pd -from sqlalchemy import create_engine, text -import logging -import config - -# 로깅 설정 -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") - -# 연도별 키워드 설정 (찾아야하는 표 바로 위의 키워드), 이전 연도 데이터는 주석 처리하고, 실제로 사용할 연도만 활성화 -YEARLY_KEYWORDS = { - 2019: ["가. 교양", "가. HRD학과", "나. MSC"], - 2020: ["❑ 교양 교과목표", "❑ HRD학과 교과목표", "❑ MSC 교과목표"], - 2021: ["❑ 교양 교과목표", "❑ HRD학과 교과목표", "❑ MSC 교과목표"], - 2022: ["❑ 교양 교과목", "❑ HRD학과 교과목", "❑ MSC 교과목"], - 2023: ["❑ 교양 교과목", "❑ HRD학과 교과목", "❑ MSC 교과목"], - 2024: ["❑ 교양 교과목", "❑ HRD학과 교과목", "❑ 수리적사고 교과목"], -} - -# 기준 헤더 - pdf에서 기준 헤더를 포함하는 표를 찾음 -TARGET_HEADER = ["교과목코드", "교과목명", "학-강-실-설", "이수구분"] - -def is_similar_header(cleaned_header, target_header): - def matches_target(cleaned_col, target_col): - if target_col in ["학-강-실", "학-강-실-설"]: - return "학-강-실" in cleaned_col or "학-강-실-설" in cleaned_col - return target_col in cleaned_col - - for target_col in target_header: - if not any(matches_target(cleaned_col, target_col) for cleaned_col in cleaned_header): - return False - return True - -def clean_header(header): - return [col.replace("\n", "").strip() for col in header if col and col.strip()] - -def extract_and_merge_tables(pdf_path, keyword): - """PDF에서 특정 키워드를 포함하는 테이블을 추출하고 병합""" - merged_table = [] - header = None - is_table_continued = False - found_table_for_keyword = False - final_target_header = TARGET_HEADER[:] - - try: - with pdfplumber.open(pdf_path) as pdf: - for i, page in enumerate(pdf.pages): - text = page.extract_text() - first_table = True # 첫 번째 표 여부 확인 - cleaned_text = " ".join(text.split()) if text else "" - cleaned_keyword = " ".join(keyword.split()) - - if cleaned_keyword in cleaned_text or is_table_continued: - logging.info(f"키워드 '{keyword}'를 페이지 {i + 1}에서 찾음") - - tables = page.extract_tables() - is_multiple_tables = len(tables) > 1 - - if tables: - for table in tables: - if table: - current_header = clean_header(table[0]) - - if is_similar_header(current_header, final_target_header): - if "영역" in current_header and "영역" not in final_target_header: - final_target_header.append("영역") # "영역" 컬럼이 있으면 추가 - - if not header: - header = current_header - - # 필요한 컬럼 인덱스 매핑 - target_indices = [ - current_header.index(col) - if col in current_header else current_header.index("학-강-실") - for col in final_target_header - if col in current_header or col == "학-강-실-설" and "학-강-실" in current_header - ] - - # 테이블 데이터 필터링 - filtered_table = [ - [row[idx] if idx < len(row) else None for idx in target_indices] for row in table[1:] - ] - merged_table.extend(filtered_table) - found_table_for_keyword = True - is_table_continued = True - - if first_table and is_multiple_tables: - is_table_continued = False - break - - else: - is_table_continued = False - first_table = False - - else: - is_table_continued = False - - if not is_table_continued and found_table_for_keyword: - break - - except Exception as e: - logging.error(f"PDF 처리 중 오류 발생: {e}") - - return merged_table if merged_table else None, final_target_header - -def process_table_data(merged_table, target_header): - """병합된 테이블 데이터를 DataFrame으로 변환""" - try: - if not merged_table: - raise ValueError("유효하지 않은 테이블 데이터") - - df = pd.DataFrame(merged_table, columns=target_header) - df.dropna(subset=["교과목코드", "교과목명"], inplace=True) - - # 학점 정보 추출 - df['credit'] = df['학-강-실-설'].apply(lambda x: int(x.split('-')[0]) if x and '-' in x else 0) - - logging.info(f"DataFrame 생성 완료. {len(df)}개 레코드 처리") - return df - except Exception as e: - logging.error(f"데이터 처리 중 오류 발생: {e}") - raise - -def create_engine_connection(): - try: - db_config = config.DATABASE_CONFIG - engine = create_engine( - f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['db']}" - ) - logging.info("데이터베이스 연결 성공") - return engine - except Exception as e: - logging.error(f"데이터베이스 연결 실패: {e}") - raise - -def insert_data_to_db(df, engine, year): - def get_or_create_department(conn): - """department 테이블에서 학과 ID 조회 후 없으면 생성""" - department_name = "학과공통" - result = conn.execute(text("SELECT id FROM department WHERE name = :name"), {"name": department_name}).fetchone() - if result: - return result[0] - conn.execute(text("INSERT INTO department (name) VALUES (:name)"), {"name": department_name}) - return conn.execute(text("SELECT LAST_INSERT_ID()")).fetchone()[0] - - def get_or_create_course_type_id(course_type_name, conn): - """course_type 테이블에서 ID 조회 후 없으면 생성""" - if not course_type_name.strip(): - return None - if course_type_name == "중등교직": - course_type_name = "자유선택" - result = conn.execute(text("SELECT id FROM course_type WHERE name = :name"), {"name": course_type_name}).fetchone() - if result: - return result[0] - conn.execute(text("INSERT INTO course_type (name) VALUES (:name)"), {"name": course_type_name}) - return conn.execute(text("SELECT LAST_INSERT_ID()")).fetchone()[0] - - def get_or_create_area_id(area_name, conn): - """general_education_area 테이블에서 ID 조회 후 없으면 생성""" - if not area_name or not area_name.strip(): - return None - result = conn.execute(text("SELECT id FROM general_education_area WHERE name = :name"), {"name": area_name}).fetchone() - if result: - return result[0] - conn.execute(text("INSERT INTO general_education_area (name) VALUES (:name)"), {"name": area_name}) - return conn.execute(text("SELECT LAST_INSERT_ID()")).fetchone()[0] - - try: - with engine.begin() as conn: - department_id = get_or_create_department(conn) - - for _, row in df.iterrows(): - area_id = get_or_create_area_id(row['영역'], conn) if "영역" in df.columns else None - course_type_id = get_or_create_course_type_id(row['이수구분'], conn) - - # 중복 확인 - existing = conn.execute( - text( - """ - SELECT id FROM catalog - WHERE year = :year - AND department_id = :department_id - AND code = :code - AND lecture_name = :lecture_name - AND credit = :credit - AND course_type_id = :course_type_id - AND general_education_area_id = :general_education_area_id - """ - ), - { - "year": year, - "department_id": department_id, - "code": row['교과목코드'], - "lecture_name": row['교과목명'], - "credit": row['credit'], - "course_type_id": course_type_id, - "general_education_area_id": area_id - } - ).fetchone() - - if existing: - continue - - conn.execute( - text(""" - INSERT INTO catalog (year, code, lecture_name, department_id, credit, course_type_id, general_education_area_id) - VALUES (:year, :code, :lecture_name, :department_id, :credit, :course_type_id, :area_id) - """), - {"year": year, "code": row['교과목코드'], "lecture_name": row['교과목명'], - "department_id": department_id, "credit": row['credit'], "course_type_id": course_type_id, - "area_id": area_id} - ) - - except Exception as e: - logging.error(f"데이터베이스 삽입 중 오류 발생: {e}") - raise - - -if __name__ == "__main__": - engine = create_engine_connection() - - for year, keywords in YEARLY_KEYWORDS.items(): - logging.info(f"==== {year}년 데이터 처리 시작 ====") - pdf_path = f"./pdfs/{year}대학요람.pdf" - - for keyword in keywords: - logging.info(f"키워드 '{keyword}' 처리 중...") - merged_table, final_target_header = extract_and_merge_tables(pdf_path, keyword) - - if merged_table: - df = process_table_data(merged_table, final_target_header) - insert_data_to_db(df, engine, year) - - logging.info("모든 데이터 삽입 완료.") \ No newline at end of file diff --git a/crawling/graduation_credit_calculator/major_subject.py b/crawling/graduation_credit_calculator/major_subject.py deleted file mode 100644 index d582040..0000000 --- a/crawling/graduation_credit_calculator/major_subject.py +++ /dev/null @@ -1,342 +0,0 @@ -import re -import pdfplumber -import pandas as pd -from sqlalchemy import create_engine, text -import logging -import config - -# 로깅 설정 -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") - -# 연도별 SHAPE 설정 (아래 키워드의 왼쪽의 모양), 이전 연도 데이터는 주석 처리하고, 실제로 사용할 연도만 활성화 -YEARLY_SHAPES = { - 2019: "▶", - 2020: "❑", - 2021: "❑", - 2022: "❑", - 2023: "❑", - 2024: "❑" -} - -# 연도별 키워드 설정 (찾아야하는 표 바로 위의 키워드), 이전 연도 데이터는 주석 처리하고, 실제로 사용할 연도만 활성화 -YEARLY_KEYWORDS = { - 2019: [ - "융합학과", "기계공학부", "메카트로닉스공학부 생산시스템전공", "메카트로닉스공학부 제어시스템전공", "메카트로닉스공학부 디지털시스템전공", - "전기・전자・통신공학부 전기공학전공", "전기・전자・통신공학부 전자공학전공", "전기・전자・통신공학부 정보통신공학전공", - "컴퓨터공학부", "디자인・건축공학부 디자인공학전공", "디자인・건축공학부 건축공학전공", - "에너지신소재화학공학부 에너지신소재공학전공", "에너지신소재화학공학부 응용화학공학전공", "산업경영학부 산업경영전공", "산업경영학부 혁신경영전공" - ], - 2020: [ - "융합학과", "기계공학부", "메카트로닉스공학부 생산시스템전공", "메카트로닉스공학부 제어시스템전공", "메카트로닉스공학부 디지털시스템전공", - "전기・전자・통신공학부 전기공학전공", "전기・전자・통신공학부 전자공학전공", "전기・전자・통신공학부 정보통신공학전공", - "컴퓨터공학부", "디자인・건축공학부 디자인공학전공", "디자인・건축공학부 건축공학전공", - "에너지신소재화학공학부 에너지신소재공학전공", "에너지신소재화학공학부 응용화학공학전공", "산업경영학부 산업경영전공", "산업경영학부 혁신경영전공" - ], - 2021: [ - "융합학과", "기계공학부", "메카트로닉스공학부 생산시스템전공", "메카트로닉스공학부 제어시스템전공", "메카트로닉스공학부 디지털시스템전공", - "전기・전자・통신공학부 전기공학전공", "전기・전자・통신공학부 전자공학전공", "전기・전자・통신공학부 정보통신공학전공", - "컴퓨터공학부", "디자인・건축공학부 디자인공학전공", "디자인・건축공학부 건축공학전공", - "에너지신소재화학공학부 에너지신소재공학전공", "에너지신소재화학공학부 응용화학공학전공", "산업경영학부" - ], - 2022: [ - "융합학과", "기계공학부", "메카트로닉스공학부 생산시스템전공", "메카트로닉스공학부 제어시스템전공", "메카트로닉스공학부 디지털시스템전공", - "전기・전자・통신공학부 전기공학전공", "전기・전자・통신공학부 전자공학전공", "전기・전자・통신공학부 정보통신공학전공", - "컴퓨터공학부", "디자인・건축공학부 디자인공학전공", "디자인・건축공학부 건축공학전공", - "에너지신소재화학공학부 에너지신소재공학전공", "에너지신소재화학공학부 응용화학공학전공", "산업경영학부", "데이터경영전공", "고용서비스정책학과" - ], - 2023: [ - "융합학과 스페셜트랙", "기계공학부", "메카트로닉스공학부 생산시스템전공", "메카트로닉스공학부 제어시스템전공", "메카트로닉스공학부 디지털시스템전공", - "전기・전자・통신공학부 전기공학전공", "전기・전자・통신공학부 전자공학전공", "전기・전자・통신공학부 정보통신공학전공", - "컴퓨터공학부", "디자인・건축공학부 디자인공학전공", "디자인・건축공학부 건축공학전공", - "에너지신소재화학공학부 에너지신소재공학전공", "에너지신소재화학공학부 응용화학공학전공", "융합경영전공", "데이터경영전공", "고용서비스정책학과" - ], - 2024: [ - "융합학과 스페셜트랙", "기계공학부", "메카트로닉스공학부 생산시스템전공", "메카트로닉스공학부 제어시스템전공", "메카트로닉스공학부 디지털시스템전공", - "전기・전자・통신공학부 전기공학전공", "전기・전자・통신공학부 전자공학전공", "전기・전자・통신공학부 정보통신공학전공", - "컴퓨터공학부", "디자인・건축공학부 디자인공학전공", "디자인・건축공학부 건축공학전공", - "에너지신소재화학공학부 에너지신소재공학전공", "에너지신소재화학공학부-화학생명공학전공", "융합경영전공", "데이터경영전공", "고용서비스정책학과" - ] -} - - -# 표 헤더 (모든 연도에 대해 동일하게 유지) -TARGET_HEADER = ["교과목코드", "교과목명", "학-강-실-설", "이수구분"] - - -def is_similar_header(cleaned_header, target_header): - def matches_target(cleaned_col, target_col): - return target_col in cleaned_col - - for target_col in target_header: - if not any(matches_target(cleaned_col, target_col) for cleaned_col in cleaned_header): - return False - return True - - -def clean_header(header): - return [col.replace("\n", "").strip() for col in header if col and col.strip()] - -def normalize_text(text): - if not text: - return "" - normalized_text = re.sub(r"[・・·]", "・", text) - normalized_text = normalized_text.replace("・", "") - normalized_text = re.sub(r"\s+", "", normalized_text) - return normalized_text.strip() - -def extract_and_merge_tables(pdf_path, keyword): - merged_table = [] - is_table_continued = False - found_table_for_keyword = False - - try: - with pdfplumber.open(pdf_path) as pdf: - for i, page in enumerate(pdf.pages): - text = page.extract_text() - if not text: - continue - - cleaned_text = normalize_text(text) - shape = YEARLY_SHAPES.get(year, "") - cleaned_keyword = f"{shape}{normalize_text(keyword)}" - - - if cleaned_keyword in cleaned_text or is_table_continued: - logging.info(f"키워드 '{cleaned_keyword}'를 페이지 {i + 1}에서 찾았습니다.") - - tables = page.extract_tables() - if tables: - for table in tables: - if not table: - continue - - current_header = clean_header(table[0]) - current_header = [col for col in current_header if col] - - if is_similar_header(current_header, TARGET_HEADER): - if not found_table_for_keyword: - found_table_for_keyword = True - is_table_continued = True - - for row in table[1:]: - filtered_row = row[:len(TARGET_HEADER)] + [None] * (len(TARGET_HEADER) - len(row)) - merged_table.append(filtered_row) - - else: - if found_table_for_keyword: - logging.info(f"새로운 표가 감지됨 (페이지 {i + 1}). 다음 키워드로 이동.") - return merged_table - - except Exception as e: - logging.error(f"PDF 처리 중 오류 발생: {e}") - - if not merged_table: - logging.warning("병합된 테이블이 없습니다.") - return None - else: - logging.info(f"총 {len(merged_table)}개의 데이터가 병합되었습니다.") - return merged_table - -def process_table_data(merged_table): - try: - if not merged_table: - raise ValueError("유효하지 않은 테이블 데이터입니다.") - - df = pd.DataFrame(merged_table, columns=TARGET_HEADER) - df.dropna(subset=["교과목코드", "교과목명"], inplace=True) - - df['credit'] = df['학-강-실-설'].apply(lambda x: int(x.split('-')[0]) if x and '-' in x else 0) - - logging.info(f"DataFrame 생성 완료. 총 {len(df)}개의 레코드가 처리되었습니다.") - return df - except Exception as e: - logging.error(f"데이터 처리 중 오류 발생: {e}") - raise - -def create_engine_connection(): - try: - db_config = config.DATABASE_CONFIG - - engine = create_engine( - f"mysql+pymysql://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['db']}" - ) - logging.info("데이터베이스 연결 성공.") - return engine - except Exception as e: - logging.error(f"데이터베이스 연결 실패: {e}") - raise - -def get_department_and_major(keyword, conn): - """ 학부 및 전공 정보를 파싱하여 department, major 테이블에 저장 또는 조회 """ - normalized_keyword = re.sub(r"[・・·]", "・", keyword) - normalized_keyword = normalized_keyword.replace("・", "").replace("-", " ") - - major_name = None - - # 예외적인 학부의 경우 학부와 전공을 직접 명시 - if "디자인공학전공" in normalized_keyword: - department_name = "디자인공학부" - major_name = f"디자인공학전공" - elif "건축공학전공" in normalized_keyword: - department_name = "건축공학부" - major_name = f"건축공학전공" - elif "데이터경영전공" in normalized_keyword or "융합경영전공" in normalized_keyword: - department_name = "산업경영학부" - major_name = f"{normalized_keyword}" - elif normalized_keyword == "컴퓨터공학부": - department_name = normalized_keyword - major_name = None - elif normalized_keyword == "기계공학부": - department_name = normalized_keyword - major_name = None - elif normalized_keyword == "고용서비스정책학과": - department_name = normalized_keyword - major_name = None - elif "에너지신소재공학전공" in normalized_keyword: - department_name = "에너지신소재공학부" - major_name = f"에너지신소재공학전공" - elif "응용화학공학전공" in normalized_keyword: - department_name = "응용화학공학부" - major_name = f"응용화학공학전공" - elif "화학생명공학전공" in normalized_keyword: - department_name = "화학생명공학부" - major_name = f"화학생명공학전공" - elif "융합학과" in normalized_keyword: - department_name = "학과공통" - major_name = None - else: - parts = normalized_keyword.split(" ") - if "전공" in parts[-1]: - major_name = parts[-1] - department_name = " ".join(parts[:-1]) - else: - department_name = normalized_keyword - - result = conn.execute( - text("SELECT id FROM department WHERE name = :name"), {"name": department_name} - ).fetchone() - if result: - department_id = result[0] - else: - conn.execute(text("INSERT INTO department (name) VALUES (:name)"), {"name": department_name}) - department_id = conn.execute(text("SELECT LAST_INSERT_ID()")).fetchone()[0] - - if department_name == "학과공통": - return department_id, None - - if not major_name: - result = conn.execute( - text("SELECT id FROM major WHERE department_id = :department_id"), {"department_id": department_id} - ).fetchone() - else: - result = conn.execute( - text("SELECT id FROM major WHERE name = :name"), {"name": major_name} - ).fetchone() - - if result: - return department_id, result[0] - - conn.execute( - text("INSERT INTO major (name, department_id) VALUES (:name, :department_id)"), - {"name": major_name, "department_id": department_id} - ) - major_id = conn.execute(text("SELECT LAST_INSERT_ID()")).fetchone()[0] - - return department_id, major_id - - -def get_or_create_course_type_id(course_type_name, conn): - if "필수" in course_type_name: - course_type_name = "전공필수" - elif "선택" in course_type_name: - course_type_name = "전공선택" - - result = conn.execute( - text("SELECT id FROM course_type WHERE name = :name"), {"name": course_type_name} - ).fetchone() - if result: - return result[0] - conn.execute( - text("INSERT INTO course_type (name) VALUES (:name)"), {"name": course_type_name} - ) - new_id = conn.execute(text("SELECT LAST_INSERT_ID()")).fetchone()[0] - logging.info(f"'{course_type_name}' 새 course_type ID 생성: {new_id}") - return new_id - - -def insert_data_to_db(df, engine, year, keyword): - try: - with engine.begin() as conn: - for _, row in df.iterrows(): - if not row["교과목코드"]: - continue - - department_id, major_id = get_department_and_major(keyword, conn) - - course_type_id = get_or_create_course_type_id(row["이수구분"], conn) - - # 중복 확인 - existing = conn.execute( - text( - """ - SELECT id FROM catalog - WHERE year = :year - AND department_id = :department_id - AND major_id = major_id - AND code = :code - AND lecture_name = :lecture_name - AND credit = :credit - AND course_type_id = :course_type_id - """ - ), - { - "year": year, - "department_id": department_id, - "major_id": major_id, - "code": row['교과목코드'], - "lecture_name": row['교과목명'], - "credit": row['credit'], - "course_type_id": course_type_id - } - ).fetchone() - - if existing: - continue - - conn.execute( - text( - """ - INSERT INTO catalog (year, code, lecture_name, department_id, major_id, credit, course_type_id) - VALUES (:year, :code, :lecture_name, :department_id, :major_id, :credit, :course_type_id) - """ - ), - { - "year": year, - "code": row["교과목코드"], - "lecture_name": row["교과목명"], - "department_id": department_id, - "major_id": major_id, - "credit": row["credit"], - "course_type_id": course_type_id, - }, - ) - - except Exception as e: - logging.error(f"데이터베이스 삽입 중 오류 발생: {e}") - raise - -if __name__ == "__main__": - engine = create_engine_connection() - - for year, keywords in YEARLY_KEYWORDS.items(): - pdf_path = f"./pdfs/{year}대학요람.pdf" - - for keyword in keywords: - logging.info(f"--------- {year}년도 대학요람에서 {keyword} 탐색 시작 ---------") - merged_table = extract_and_merge_tables(pdf_path, keyword) - - if merged_table: - df = process_table_data(merged_table) - insert_data_to_db(df, engine, year, keyword) - - logging.info("모든 데이터 삽입 완료.") \ No newline at end of file diff --git "a/crawling/graduation_credit_calculator/pdfs/2019\353\214\200\355\225\231\354\232\224\353\236\214.pdf" "b/crawling/graduation_credit_calculator/pdfs/2019\353\214\200\355\225\231\354\232\224\353\236\214.pdf" deleted file mode 100644 index 3048434..0000000 Binary files "a/crawling/graduation_credit_calculator/pdfs/2019\353\214\200\355\225\231\354\232\224\353\236\214.pdf" and /dev/null differ diff --git "a/crawling/graduation_credit_calculator/pdfs/2020\353\214\200\355\225\231\354\232\224\353\236\214.pdf" "b/crawling/graduation_credit_calculator/pdfs/2020\353\214\200\355\225\231\354\232\224\353\236\214.pdf" deleted file mode 100644 index cb154a9..0000000 Binary files "a/crawling/graduation_credit_calculator/pdfs/2020\353\214\200\355\225\231\354\232\224\353\236\214.pdf" and /dev/null differ diff --git "a/crawling/graduation_credit_calculator/pdfs/2021\353\214\200\355\225\231\354\232\224\353\236\214.pdf" "b/crawling/graduation_credit_calculator/pdfs/2021\353\214\200\355\225\231\354\232\224\353\236\214.pdf" deleted file mode 100644 index 2de0775..0000000 Binary files "a/crawling/graduation_credit_calculator/pdfs/2021\353\214\200\355\225\231\354\232\224\353\236\214.pdf" and /dev/null differ diff --git "a/crawling/graduation_credit_calculator/pdfs/2022\353\214\200\355\225\231\354\232\224\353\236\214.pdf" "b/crawling/graduation_credit_calculator/pdfs/2022\353\214\200\355\225\231\354\232\224\353\236\214.pdf" deleted file mode 100644 index 7a94853..0000000 Binary files "a/crawling/graduation_credit_calculator/pdfs/2022\353\214\200\355\225\231\354\232\224\353\236\214.pdf" and /dev/null differ diff --git "a/crawling/graduation_credit_calculator/pdfs/2023\353\214\200\355\225\231\354\232\224\353\236\214.pdf" "b/crawling/graduation_credit_calculator/pdfs/2023\353\214\200\355\225\231\354\232\224\353\236\214.pdf" deleted file mode 100644 index 2e9f5d2..0000000 Binary files "a/crawling/graduation_credit_calculator/pdfs/2023\353\214\200\355\225\231\354\232\224\353\236\214.pdf" and /dev/null differ diff --git "a/crawling/graduation_credit_calculator/pdfs/2024\353\214\200\355\225\231\354\232\224\353\236\214.pdf" "b/crawling/graduation_credit_calculator/pdfs/2024\353\214\200\355\225\231\354\232\224\353\236\214.pdf" deleted file mode 100644 index bb41766..0000000 Binary files "a/crawling/graduation_credit_calculator/pdfs/2024\353\214\200\355\225\231\354\232\224\353\236\214.pdf" and /dev/null differ diff --git a/crawling/koreatech_article/config.example.py b/crawling/koreatech_article/config.example.py index 1a1a9f4..f7dea69 100644 --- a/crawling/koreatech_article/config.example.py +++ b/crawling/koreatech_article/config.example.py @@ -1,31 +1,11 @@ -MYSQL_CONFIG = { - 'host': '', - 'db': '', - 'user': '', - 'password': '', - 'port': 3306 -} +import importlib.util +import os -PORTAL_CONFIG = { - 'id': '', - 'password': '', - 'ip': '' -} +_parent_config = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'config.py') +_spec = importlib.util.spec_from_file_location('_central_config', _parent_config) +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) -S3_CONFIG = { - 'aws_access_key_id': '', - 'aws_secret_access_key': '', - 'bucket': '', - 'upload_domain': '' -} - -SLACK_CONFIG = { - 'url': '' -} - -BATCH_CONFIG = { - 'email': '', - 'password': '', - 'token_url': '', # 토큰 발급 API URL - 'notification_api_url': '', # 알림 API URL -} +for _name in dir(_mod): + if not _name.startswith('_'): + globals()[_name] = getattr(_mod, _name) diff --git a/crawling/koreatech_bus/cheonan_commuting.yaml b/crawling/koreatech_bus/cheonan_commuting.yaml deleted file mode 100644 index 0f29f1c..0000000 --- a/crawling/koreatech_bus/cheonan_commuting.yaml +++ /dev/null @@ -1,135 +0,0 @@ -nodes: - - &cheonan_station "천안역(학화호두과자)" - - &nambu "남부오거리(귀뚜라미 보일러)" - - &samryong "삼룡교(유니클로)" - - &guseong "구성동 부광약국" - - &baro "바로약국 앞" - - &dongwoo__singuye_cho__dongwon_ri__yeonchun_ri "동우APT,신계초,운전리,연춘리" - - &dongwoo_apt "동우APT" - - &singuye_cho__dongwon_ri__yeonchun_ri "신계초,운전리,연춘리" - - &joongang "중앙APT" - - &koreatech "한기대" - - &dongil "동일하이빌APT" - - &joogong "주공11단지APT" - - &terminal "터미널(신세계 앞 횡단보도)" - - &jeil "제일고 맞은편(구 교육청)" - - &wonseung "원성동(GS슈퍼)" - - &doojung "두정역" - - &nodong "노동부(천안지방사무소)" - - &neul "늘푸른극동아파트" - - &sungjung "성정지하도(6단지)" - - &jeonja "전자랜드" - - &gwanghye "광혜당약국" - - &choongmu "충무병원(백년돌침대)" - - &sejong "세종아트빌라BS(구 일봉회관)" - - &onyang "온양온천역BS(1번출구)" - - &baebang "배방읍행정복지센터BS" - - &hoseo "호서웨딩홀BS" - - &ktx_exit_3 "천안아산 KTX(3번출구 6번 승강장)" - - &y_city "Y-City(상공회의소 앞)" - - &hanwha "한화 꿈에그린APT BS" - - &yongam "용암마을(하나은행 앞)" - - &sinbang_leechard "신방동 리차드" - - &sinbang_gs "신방동 GS주유소" - - &chungdang "청당동(벽산블루밍)" - - &buyoung "부영APT" - - &hanyang "한양 수자인BS" - - &emart "이마트 아산점" - -to_school: - - route_name: "터미널/천안역" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *terminal - arrival_time: "08:05" - - node_name: *cheonan_station - arrival_time: "08:10" - - node_name: *nambu - arrival_time: "08:12" - - node_name: *samryong - arrival_time: "08:13" - - node_name: *baro - arrival_time: "08:15" - - node_name: *dongwoo__singuye_cho__dongwon_ri__yeonchun_ri - arrival_time: "정차" - - node_name: *koreatech - arrival_time: "08:50" - - - route_name: "두정역/KTX" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *doojung - arrival_time: "07:41" - - node_name: *nodong - arrival_time: "07:43" - - node_name: *neul - arrival_time: "07:44" - - node_name: *sungjung - arrival_time: "07:46" - - node_name: *jeonja - arrival_time: "07:47" - - node_name: *gwanghye - arrival_time: "07:48" - - node_name: *choongmu - arrival_time: "07:51" - - node_name: *yongam - arrival_time: "07:54" - - node_name: *y_city - arrival_time: "07:56" - - node_name: *ktx_exit_3 - arrival_time: "08:01" - - node_name: *chungdang - arrival_time: "08:20" - - node_name: *joongang - arrival_time: "08:40" - - node_name: *koreatech - arrival_time: "08:50" - -from_school: - - route_name: "천안역/터미널" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:10" - - node_name: *dongwoo__singuye_cho__dongwon_ri__yeonchun_ri - arrival_time: "정차" - - node_name: *baro - arrival_time: "18:45" - - node_name: *samryong - arrival_time: "18:47" - - node_name: *nambu - arrival_time: "18:48" - - node_name: *cheonan_station - arrival_time: "18:50" - - node_name: *terminal - arrival_time: "18:55" - - - route_name: "KTX/두정역" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:10" - - node_name: *joongang - arrival_time: "18:20" - - node_name: *chungdang - arrival_time: "18:40" - - node_name: *ktx_exit_3 - arrival_time: "18:59" - - node_name: *y_city - arrival_time: "19:04" - - node_name: *yongam - arrival_time: "19:06" - - node_name: *choongmu - arrival_time: "19:09" - - node_name: *gwanghye - arrival_time: "19:12" - - node_name: *jeonja - arrival_time: "19:13" - - node_name: *sungjung - arrival_time: "19:14" - - node_name: *neul - arrival_time: "19:16" - - node_name: *nodong - arrival_time: "19:17" - - node_name: *doojung - arrival_time: "19:19" diff --git a/crawling/koreatech_bus/cheonan_shuttle.yaml b/crawling/koreatech_bus/cheonan_shuttle.yaml deleted file mode 100644 index 33413f5..0000000 --- a/crawling/koreatech_bus/cheonan_shuttle.yaml +++ /dev/null @@ -1,126 +0,0 @@ -nodes: - - &koreatech "한기대" - - &terminal "터미널(신세계 앞 횡단보도)" - - &cheonan_station "천안역(학화호두과자)" - - &campus_2 "2캠퍼스(두정캠퍼스)" - - &ktx_exit_3 "천안아산 KTX (3번출구 6번 승강장)" - - &dujeong-station "두정역" - - &cheonan_station_b "천안역(태극당건너BS)" - - &terminal_b "터미널(신세계 반대 횡단보도)" - - &samryong "삼룡교(유니클로)" - - &baro "바로약국 앞" - - -to_school: - - route_name: "토요일 일학습병행대학(08시)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *dujeong-station - arrival_time: "08:00" - - node_name: *terminal - arrival_time: "08:05" - - node_name: *cheonan_station - arrival_time: "08:10" - - node_name: *koreatech - arrival_time: "도착" - - - route_name: "토요일 일학습병행대학(08시 05분)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *campus_2 - arrival_time: "08:05" - - node_name: *dujeong-station - arrival_time: "08:10" - - node_name: *terminal - arrival_time: "08:15" - - node_name: *cheonan_station - arrival_time: "08:20" - - node_name: *koreatech - arrival_time: "도착" - - - route_name: "토요일 천안아산역(08시 15분)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *ktx_exit_3 - arrival_time: "08:15" - - node_name: *koreatech - arrival_time: "도착" - -from_school: - - route_name: "주중(14시 10분)" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "14:10" - - node_name: *terminal - arrival_time: "14:35" - - node_name: *cheonan_station - arrival_time: "14:40" - - node_name: *samryong - arrival_time: "14:45" - - node_name: *baro - arrival_time: "14:48" - - node_name: *koreatech - arrival_time: "15:30" - - - route_name: "주중(20시 30분)" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "20:30" - - node_name: *terminal - arrival_time: "20:55" - - node_name: *cheonan_station - arrival_time: "21:00" - - node_name: *samryong - arrival_time: "미정차" - - node_name: *baro - arrival_time: "미정차" - - node_name: *koreatech - arrival_time: "21:30" - - - route_name: "토요일 일학습병행대학(16시 15분)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *koreatech - arrival_time: "16:15" - - node_name: *cheonan_station_b - arrival_time: "16:45" - - node_name: *terminal_b - arrival_time: "16:55" - - node_name: *dujeong-station - arrival_time: "하차" - - - route_name: "토요일 일학습병행대학(18시 15분)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:15" - - node_name: *cheonan_station_b - arrival_time: "18:45" - - node_name: *terminal_b - arrival_time: "18:55" - - node_name: *dujeong-station - arrival_time: "하차" - - - route_name: "토요일 전문대학원(16시 40분)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *campus_2 - arrival_time: "16:40" - - node_name: *dujeong-station - arrival_time: "하차" - - node_name: *terminal - arrival_time: "하차" - - node_name: *cheonan_station - arrival_time: "하차" - - node_name: *ktx_exit_3 - arrival_time: "하차" - - - route_name: "토요일 전문대학원(16시 15분)" - running_days: [ "SAT" ] - arrival_info: - - node_name: *campus_2 - arrival_time: "16:15" - - node_name: *ktx_exit_3 - arrival_time: "하차" \ No newline at end of file diff --git a/crawling/koreatech_bus/cheongju_commuting.yaml b/crawling/koreatech_bus/cheongju_commuting.yaml deleted file mode 100644 index 577582c..0000000 --- a/crawling/koreatech_bus/cheongju_commuting.yaml +++ /dev/null @@ -1,137 +0,0 @@ -nodes: - - &dongnam "동남지구(대원칸타빌 BS)" - - &prau "프라우 삼성산부인과" - - &cheongju_school "청주혜원학교" - - &geumchun "금천광장(농협앞)" - - &dragon_bezing "용담동베이징(초양교회 앞)" - - &sangdang "상당공원 B(안경매니져 성안점 앞)" - - &gym "체육관(NEPA)" - - &sachang "사창사거리(청주고용센터맞은편BS)" - - &gallery_hotel "갤러리호텔(봉명우체국BS)" - - &hs_foret "HS포레(봉명우체국BS)" - - &solbat "솔밭공원" - - &cheongju_station "청주역 A(옥산방면 청주역BS)" - - &bangsu "방서동(다이소)" - - &koreatech "학교" - - &yongarmdong_hyundai "용암동 현대APT" - - &gs_daechung "GS 대청주유소 맞은편(용암동)" - - &seokyodong "석교동 육거리" - - &sangdang_park "상당공원 A(상공회의소)" - - &cheongju_cityhall "청주시청(방아다리)" - - &culture_industry "문화 산업단지(구 제조창)" - - &maria_hospital "성모병원(율량 맥도널드)" - - &scinece_state "과학단지(오창프라자)" - - &electronic_land "분평동 전자랜드" - - &namsung_elementry "남성초등학교" - - &sannam_sugok "산남동 수곡교회" - - &chungbuk_nonghyup "충북원예농협(GS마트)" - - &chungbuk_university_hospital "충북대병원BS" - - &KBS_reverse "KBS맞은편" - - &gaesin_puruzio "개신동 푸르지오BS" - - &samil_apt "삼일아파트BS" - - &busterminal_reverse "시외버스터미널(롯데마트앞)" - - &heungduck_highschool "흥덕고교 BS" - - &cheongju_station_seochon "청주역 서촌동BS" - - &oksangarak_threeri "옥산가락3리BS" - -to_school: - - route_name: "동남지구" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *dongnam - arrival_time: "07:20" - - node_name: *prau - arrival_time: "07:22" - - node_name: *cheongju_school - arrival_time: "07:24" - - node_name: *geumchun - arrival_time: "07:25" - - node_name: *dragon_bezing - arrival_time: "07:28" - - node_name: *sangdang - arrival_time: "07:37" - - node_name: *gym - arrival_time: "07:40" - - node_name: *sachang - arrival_time: "07:42" - - node_name: *hs_foret - arrival_time: "07:46" - - node_name: *solbat - arrival_time: "07:50" - - node_name: *cheongju_station - arrival_time: "07:55" - - node_name: *koreatech - arrival_time: "08:50" - - - route_name: "용암동" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *bangsu - arrival_time: "07:28" - - node_name: *yongarmdong_hyundai - arrival_time: "07:30" - - node_name: *gs_daechung - arrival_time: "07:31" - - node_name: *seokyodong - arrival_time: "07:33" - - node_name: *sangdang_park - arrival_time: "07:36" - - node_name: *culture_industry - arrival_time: "07:40" - - node_name: *maria_hospital - arrival_time: "07:45" - - node_name: *scinece_state - arrival_time: "08:00" - - node_name: *koreatech - arrival_time: "08:50" - -from_school: - - route_name: "동남지구" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:10" - - node_name: *cheongju_station - arrival_time: "19:05" - - node_name: *solbat - arrival_time: "19:10" - - node_name: *hs_foret - arrival_time: "19:14" - - node_name: *sachang - arrival_time: "19:18" - - node_name: *gym - arrival_time: "19:20" - - node_name: *sangdang - arrival_time: "19:23" - - node_name: *dragon_bezing - arrival_time: "19:32" - - node_name: *geumchun - arrival_time: "19:35" - - node_name: *cheongju_school - arrival_time: "19:36" - - node_name: *prau - arrival_time: "19:38" - - node_name: *dongnam - arrival_time: "19:40" - - - route_name: "용암동" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:10" - - node_name: *scinece_state - arrival_time: "19:00" - - node_name: *maria_hospital - arrival_time: "19:15" - - node_name: *culture_industry - arrival_time: "19:20" - - node_name: *sangdang_park - arrival_time: "19:24" - - node_name: *seokyodong - arrival_time: "19:27" - - node_name: *gs_daechung - arrival_time: "19:29" - - node_name: *yongarmdong_hyundai - arrival_time: "19:30" - - node_name: *bangsu - arrival_time: "19:32" diff --git a/crawling/koreatech_bus/cheongju_shuttle.yaml b/crawling/koreatech_bus/cheongju_shuttle.yaml deleted file mode 100644 index 510acb0..0000000 --- a/crawling/koreatech_bus/cheongju_shuttle.yaml +++ /dev/null @@ -1,217 +0,0 @@ -#nodes: -# - &university "대학(본교)" -# - &bunpyung_electro "분평동 전자랜드" -# - &namseong_elem "남성초등학교" -# - &sannam_church "산남동수곡교회" -# - &horticultural_coop "충북 원예농협(GS 마트)" -# - &chungbuk_uni_hospital_bs "충북대병원BS" -# - &oksan_garak3ri "옥산(가락3리)" -# - &chungjoo_b "청주역 B" -# - &g_well_city "G-WELL CITY" -# - &solbat_park "솔밭공원" -# - &hs_fore "HS포레(봉명우체국BS)" -# - &oc_science_park "오창과학단지" -# - &sm_hospital "성모병원" -# - &sinbong_crossroad "신봉사거리(LPG충전소)" -# - &bongmyoung_four "봉명사거리" -# - &sachang_four "사창사거리" -# - &gym "체육관" -# - &kbs "KBS 맞은편" -# - &sangdang_park_c "상당공원 C(지하상가BS)" -# - &seokgyo_six "석교동 육거리" -# - &bangseo_daiso "방서동(다이소)" -# - &samyoung_gas "삼영가스" -# - &yongam_hyundai "용암동현대@" -# - &bangseo "방서동(하교)" -# - &gaesin_purugio_bs "개신동 푸르지오@ BS" -# - &sam_il_apt_bs "삼일아파트BS" -# - &gs_daechung_opposite "GS 대청주유소 맞은편(용암동)" -# - &seokgyo_six_dup "석교동 육거리" -# - &dongnamji_dae_won "동남지구(대원칸타빌 BS)" -# - &frau_samsung "프라우 삼성산부인과" -# - &cheongju_hye_won_school "청주혜원학교" -# - &geumcheon_plaza "금천광장(농협앞)" -# - &yongdam_beijing "용담동베이징(초양교회 앞)" -# - &sangdang_park_b "상당공원 B(안경매니져 성안점 앞)" -# - &gym_nepa "체육관(NEPA)" -# - &sachang_four_bs "사창사거리(청주고용센터맞은편BS)" -# - &hs_fore_dup "HS포레(봉명우체국BS)" -# - &solbat_park_dup "솔밭공원" -# - &sangdang_park_a "상당공원 A(상공회의소)" -# - &bus_terminal_across "시외버스TM건너(롯데마트 앞)" -# - &culture_industry "문화 산업단지(화생방한약방)" -# - &sm_hospital_yullyang "성모병원(율량 맥도널드)" -# - &heungdeok_high_bs "흥덕고교 BS" -# - &chungjoo_a "청주역 A(서촌동 BS)" -# - &oksan_garak3ri_bs "옥산 (가락3리 BS)" -# - &science_park "과학단지(오창프라자)" -# -#from_school: -# - route_name: "주중(13시 30분)" -# running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] -# arrival_info: -# - node_name: *university -# arrival_time: "13:30" -# - node_name: *bunpyung_electro -# arrival_time: "청주역경유" -# - node_name: *oksan_garak3ri -# arrival_time: "13:55" -# - node_name: *chungjoo_b -# arrival_time: "14:05" -# - node_name: *g_well_city -# arrival_time: "14:09" -# - node_name: *solbat_park -# arrival_time: "14:10" -# - node_name: *hs_fore -# arrival_time: "14:17" -# - node_name: *sachang_four -# arrival_time: "14:22" -# - node_name: *gym -# arrival_time: "14:25" -# - node_name: *sangdang_park_c -# arrival_time: "14:30" -# - node_name: *seokgyo_six -# arrival_time: "14:33" -# - node_name: *samyoung_gas -# arrival_time: "14:36" -# - node_name: *yongam_hyundai -# arrival_time: "14:40" -# - node_name: *sangdang_park_a -# arrival_time: "14:52" -# - node_name: *culture_industry -# arrival_time: "14:57" -# - node_name: *sm_hospital_yullyang -# arrival_time: "15:01" -# - node_name: *science_park -# arrival_time: "15:13" -# - node_name: *university -# arrival_time: "15:50" -# -# - route_name: "주중(15시 30분)" -# running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] -# arrival_info: -# - node_name: *university -# arrival_time: "15:30" -# - node_name: *bunpyung_electro -# arrival_time: "오창과학단지경유" -# - node_name: *oc_science_park -# arrival_time: "하차" -# - node_name: *sm_hospital_yullyang -# arrival_time: "하차" -# - node_name: *sinbong_crossroad -# arrival_time: "하차" -# - node_name: *bongmyoung_four -# arrival_time: "하차" -# - node_name: *sachang_four -# arrival_time: "하차" -# - node_name: *gym -# arrival_time: "하차" -# - node_name: *sangdang_park_c -# arrival_time: "하차" -# - node_name: *seokgyo_six_dup -# arrival_time: "하차" -# - node_name: *samyoung_gas -# arrival_time: "하차" -# - node_name: *yongam_hyundai -# arrival_time: "하차" -# - node_name: *bangseo -# arrival_time: "종점" -# -# - route_name: "주중(20시 00분)" -# running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] -# arrival_info: -# - node_name: *university -# arrival_time: "20:00" -# - node_name: *bunpyung_electro -# arrival_time: "청주역경유" -# - node_name: *oksan_garak3ri -# arrival_time: "20:25" -# - node_name: *chungjoo_b -# arrival_time: "20:32" -# - node_name: *g_well_city -# arrival_time: "20:39" -# - node_name: *solbat_park -# arrival_time: "20:41" -# - node_name: *hs_fore -# arrival_time: "20:45" -# - node_name: *sachang_four -# arrival_time: "20:48" -# - node_name: *gym -# arrival_time: "20:50" -# - node_name: *sangdang_park_c -# arrival_time: "20:53" -# - node_name: *seokgyo_six -# arrival_time: "20:56" -# - node_name: *samyoung_gas -# arrival_time: "21:00" -# - node_name: *yongam_hyundai -# arrival_time: "21:02" -# - node_name: *sangdang_park_a -# arrival_time: "21:15" -# - node_name: *culture_industry -# arrival_time: "21:20" -# - node_name: *sm_hospital_yullyang -# arrival_time: "21:25" -# - node_name: *science_park -# arrival_time: "21:35" -# - node_name: *university -# arrival_time: "21:50" -# -# - route_name: "목,금(22시 10분)" -# running_days: [ "THU", "FRI" ] -# arrival_info: -# - node_name: *university -# arrival_time: "22:10" -# - node_name: *bunpyung_electro -# arrival_time: "오창과학단지경유" -# - node_name: *oc_science_park -# arrival_time: "하차" -# - node_name: *sm_hospital_yullyang -# arrival_time: "하차" -# - node_name: *sinbong_crossroad -# arrival_time: "하차" -# - node_name: *bongmyoung_four -# arrival_time: "하차" -# - node_name: *sachang_four -# arrival_time: "하차" -# - node_name: *gym -# arrival_time: "하차" -# - node_name: *sangdang_park_c -# arrival_time: "하차" -# - node_name: *seokgyo_six_dup -# arrival_time: "하차" -# - node_name: *samyoung_gas -# arrival_time: "하차" -# - node_name: *yongam_hyundai -# arrival_time: "하차" -# - node_name: *bangseo -# arrival_time: "종점" -# -#to_school: -# - route_name: "주중(11시 50분)" -# running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] -# arrival_info: -# - node_name: *bangseo_daiso -# arrival_time: "11:50" -# - node_name: *yongam_hyundai -# arrival_time: "11:51" -# - node_name: *gs_daechung_opposite -# arrival_time: "11:52" -# - node_name: *seokgyo_six_dup -# arrival_time: "11:54" -# - node_name: *sangdang_park_b -# arrival_time: "11:56" -# - node_name: *gym_nepa -# arrival_time: "11:58" -# - node_name: *sachang_four_bs -# arrival_time: "12:00" -# - node_name: *hs_fore_dup -# arrival_time: "12:02" -# - node_name: *solbat_park_dup -# arrival_time: "12:05" -# - node_name: *chungjoo_a -# arrival_time: "12:10" -# - node_name: *oksan_garak3ri_bs -# arrival_time: "12:12" -# - node_name: *university -# arrival_time: "12:50" diff --git a/crawling/koreatech_bus/config.example.properties b/crawling/koreatech_bus/config.example.properties deleted file mode 100644 index d0a3423..0000000 --- a/crawling/koreatech_bus/config.example.properties +++ /dev/null @@ -1,16 +0,0 @@ -dataSource.driverName= -dataSource.protocol= -dataSource.ipAddress= -dataSource.port= -dataSource.username= -dataSource.password= -dataSource.database= - -mongo.host= -mongo.port= -mongo.username= -mongo.password= -mongo.database= - -project.env= -project.domain= diff --git a/crawling/koreatech_bus/daejeon_commuting.yaml b/crawling/koreatech_bus/daejeon_commuting.yaml deleted file mode 100644 index 4ea01e5..0000000 --- a/crawling/koreatech_bus/daejeon_commuting.yaml +++ /dev/null @@ -1,26 +0,0 @@ -nodes: - - &koreatech "한기대" - - &daejeon_station "대전역" - - &compound_terminal "복합터미널" - -to_school: - - route_name: "일요일(18시 20분)" - running_days: [ "SUN" ] - arrival_info: - - node_name: *daejeon_station - arrival_time: "18:20" - - node_name: *compound_terminal - arrival_time: "하차" - - node_name: *koreatech - arrival_time: "하차" - -from_school: - - route_name: "금요일(18시 20분)" - running_days: [ "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:20" - - node_name: *compound_terminal - arrival_time: "하차" - - node_name: *daejeon_station - arrival_time: "하차" diff --git a/crawling/koreatech_bus/go.mod b/crawling/koreatech_bus/go.mod deleted file mode 100644 index ea163ef..0000000 --- a/crawling/koreatech_bus/go.mod +++ /dev/null @@ -1,23 +0,0 @@ -module koreatech_bus - -go 1.19 - -require ( - github.com/go-sql-driver/mysql v1.7.0 - go.mongodb.org/mongo-driver v1.10.2 - gopkg.in/yaml.v3 v3.0.1 -) - -require ( - github.com/golang/snappy v0.0.1 // indirect - github.com/klauspost/compress v1.13.6 // indirect - github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect - github.com/pkg/errors v0.9.1 // indirect - github.com/xdg-go/pbkdf2 v1.0.0 // indirect - github.com/xdg-go/scram v1.1.1 // indirect - github.com/xdg-go/stringprep v1.0.3 // indirect - github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect - golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d // indirect - golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect - golang.org/x/text v0.3.7 // indirect -) diff --git a/crawling/koreatech_bus/go.sum b/crawling/koreatech_bus/go.sum deleted file mode 100644 index dcd2f48..0000000 --- a/crawling/koreatech_bus/go.sum +++ /dev/null @@ -1,58 +0,0 @@ -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/go-sql-driver/mysql v1.7.0 h1:ueSltNNllEqE3qcWBTD0iQd3IpL/6U+mJxLkazJ7YPc= -github.com/go-sql-driver/mysql v1.7.0/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= -github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= -github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= -github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc= -github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0= -github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4= -github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= -github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= -github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= -github.com/xdg-go/scram v1.1.1 h1:VOMT+81stJgXW3CpHyqHN3AXDYIMsx56mEFrB37Mb/E= -github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g= -github.com/xdg-go/stringprep v1.0.3 h1:kdwGpVNwPFtjs98xCGkHjQtGKh86rDcRZN17QEMCOIs= -github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8= -github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d h1:splanxYIlg+5LfHAM6xpdFEAYOk8iySO56hMFq6uLyA= -github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= -go.mongodb.org/mongo-driver v1.10.2 h1:4Wk3cnqOrQCn0P92L3/mmurMxzdvWWs5J9jinAVKD+k= -go.mongodb.org/mongo-driver v1.10.2/go.mod h1:z4XpeoU6w+9Vht+jAFyLgVrD+jGSQQe0+CBWFHNiHt8= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ= -golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/crawling/koreatech_bus/main.go b/crawling/koreatech_bus/main.go deleted file mode 100644 index ec9043c..0000000 --- a/crawling/koreatech_bus/main.go +++ /dev/null @@ -1,221 +0,0 @@ -package main - -import ( - "bufio" - "context" - "database/sql" - "fmt" - "go.mongodb.org/mongo-driver/bson" - "log" - "os" - "path/filepath" - "runtime" - "strings" - "time" - - _ "github.com/go-sql-driver/mysql" - "go.mongodb.org/mongo-driver/mongo" - "go.mongodb.org/mongo-driver/mongo/options" - "gopkg.in/yaml.v3" -) - -type BusInfo struct { - region string - busType string -} - -var fileMapper = map[string]BusInfo{ - "cheonan_commuting.yaml": {region: "천안", busType: "commuting"}, - "cheonan_shuttle.yaml": {region: "천안", busType: "shuttle"}, - "sejong_commuting.yaml": {region: "세종", busType: "commuting"}, - "daejeon_commuting.yaml": {region: "대전", busType: "commuting"}, - "seoul_commuting.yaml": {region: "서울", busType: "commuting"}, - "cheongju_shuttle.yaml": {region: "청주", busType: "shuttle"}, - "cheongju_commuting.yaml": {region: "청주", busType: "commuting"}, -} - -type Timetable struct { - ToSchool []Route `yaml:"to_school" json:"to_school" bson:"to_school"` - FromSchool []Route `yaml:"from_school" json:"from_school" bson:"from_school"` -} - -type SchoolBus struct { - Region string `yaml:"region" json:"region" bson:"region"` - BusType string `yaml:"bus_type" json:"bus_type" bson:"bus_type"` - Direction string `yaml:"direction" json:"direction" bson:"direction"` - Routes []Route `yaml:"routes" json:"routes" bson:"routes"` -} - -type Route struct { - RouteName string `yaml:"route_name" json:"route_name" bson:"route_name"` - RunningDays []string `yaml:"running_days" json:"running_days" bson:"running_days"` - ArrivalInfo []ArrivalInfo `yaml:"arrival_info" json:"arrival_info" bson:"arrival_info"` -} - -type ArrivalInfo struct { - NodeName string `yaml:"node_name" json:"node_name" bson:"node_name"` - ArrivalTime string `yaml:"arrival_time" json:"arrival_time" bson:"arrival_time"` -} - -func bindingData(data []byte, class interface{}) error { - switch class.(type) { - case *Timetable: - err := yaml.Unmarshal(data, class) - if err != nil { - return fmt.Errorf("error on binding: %w", err) - } - } - return nil -} - -func getBusSchedule(fileName string, class interface{}) error { - data, err := os.ReadFile(fileName) - if err != nil { - return fmt.Errorf("error on reading: %w", err) - } - return bindingData(data, class) -} - -type Properties map[string]string - -func ConnectMongoDB(configs Properties) (client *mongo.Client, ctx context.Context, cancel context.CancelFunc) { - ctx, cancel = context.WithTimeout(context.Background(), 3*time.Second) - - uri := fmt.Sprintf("mongodb://%s:%s@%s:%s", configs["mongo.username"], configs["mongo.password"], configs["mongo.host"], configs["mongo.port"]) - // uri := fmt.Sprintf("%s://%s:%s", "mongodb", configs["mongo.host"], configs["mongo.port"]) - - clientOptions := options.Client().ApplyURI(uri) - client, _ = mongo.Connect(ctx, clientOptions) - - return client, ctx, cancel -} - -func getConfigProperties() (Properties, error) { - configFile, err := os.Open("config.properties") - if err != nil { - panic(err.Error()) - } - - configs := Properties{} - properties, err := fillConfigProperties(configFile, configs, err) - - if err != nil { - return properties, err - } - return configs, nil -} - -func fillConfigProperties(configFile *os.File, configs Properties, err error) (Properties, error) { - scanner := bufio.NewScanner(configFile) - for scanner.Scan() { - aLine := scanner.Text() - - separateIndex := strings.Index(aLine, "=") - if separateIndex == -1 { - continue - } - - key := strings.TrimSpace(aLine[:separateIndex]) - value := strings.TrimSpace(aLine[separateIndex+1:]) - - if len(key) == 0 { - continue - } - configs[key] = value - } - - err = scanner.Err() - if err != nil { - return nil, err - } - - return nil, nil -} - -func main() { - //Config - configs, err := getConfigProperties() - - if err != nil { - panic(err.Error()) - } - - // MongoDB - mongodb, ctx, _ := ConnectMongoDB(configs) - col := mongodb.Database(configs["mongo.database"]).Collection("bus_timetables") - findAndReplaceOptions := options.FindOneAndReplaceOptions{} - findAndReplaceOptions.SetUpsert(true) - - // MySQL - dataSourceName := fmt.Sprintf("%s:%s@%s(%s:%s)/%s", configs["dataSource.username"], configs["dataSource.password"], configs["dataSource.protocol"], configs["dataSource.ipAddress"], configs["dataSource.port"], configs["dataSource.database"]) - mysql, err := sql.Open(configs["dataSource.driverName"], dataSourceName) - if err != nil { - panic(err.Error()) - } - defer func(mysql *sql.DB) { - err := mysql.Close() - if err != nil { - panic(err.Error()) - } - }(mysql) - - _, filename, _, _ := runtime.Caller(0) - pwd := filepath.Dir(filename) - - // 통학버스 - for key, value := range fileMapper { - schoolBus := new(Timetable) - if err := getBusSchedule(filepath.Join(pwd, key), schoolBus); err != nil { - log.Fatal(err) - } - - schoolBusTo, schoolBusFrom := &SchoolBus{ - Region: value.region, - BusType: value.busType, - Direction: "to", - Routes: schoolBus.ToSchool, - }, &SchoolBus{ - Region: value.region, - BusType: value.busType, - Direction: "from", - Routes: schoolBus.FromSchool, - } - - if err := col.FindOneAndReplace(ctx, bson.D{ - {"region", schoolBusTo.Region}, - {"bus_type", schoolBusTo.BusType}, - {"direction", schoolBusTo.Direction}, - }, schoolBusTo, &findAndReplaceOptions); err.Err() != nil { - log.Printf("%s-%s-%s 저장 완료\r\n", schoolBusTo.BusType, schoolBusTo.Region, schoolBusTo.Direction) - } else { - log.Printf("%s-%s-%s 업데이트 완료\r\n", schoolBusTo.BusType, schoolBusTo.Region, schoolBusTo.Direction) - } - - if err := col.FindOneAndReplace(ctx, bson.D{ - {"region", schoolBusFrom.Region}, - {"bus_type", schoolBusFrom.BusType}, - {"direction", schoolBusFrom.Direction}, - }, schoolBusFrom, &findAndReplaceOptions); err.Err() != nil { - log.Printf("%s-%s-%s 저장 완료\r\n", schoolBusFrom.BusType, schoolBusFrom.Region, schoolBusFrom.Direction) - } else { - log.Printf("%s-%s-%s 업데이트 완료\r\n", schoolBusFrom.BusType, schoolBusFrom.Region, schoolBusFrom.Direction) - } - } - - updateVersion(mysql) -} - -func updateVersion(mysql *sql.DB) { - now := time.Now() - version := fmt.Sprintf("%d0_%d", now.Year(), now.UnixMilli()/1000) - if _, err := mysql.Query( - "INSERT INTO versions (version, type) VALUES (?, ?) ON DUPLICATE KEY UPDATE version = ?;", - version, - "shuttle_bus_timetable", - version, - ); err == nil { - log.Printf("%s 버전 업데이트 완료\r\n", version) - } else { - log.Fatal("버전 업데이트 실패\r\n", err) - } -} diff --git a/crawling/koreatech_bus/sejong_commuting.yaml b/crawling/koreatech_bus/sejong_commuting.yaml deleted file mode 100644 index 4a4eb06..0000000 --- a/crawling/koreatech_bus/sejong_commuting.yaml +++ /dev/null @@ -1,67 +0,0 @@ -nodes: - - &complex_nine "9단지중흥S클래스(달콤제작소)" - - &sejong_city_hall "세종시청(자율주행서비스정류장)" - - &sejong_love_church "세종 순복음더사랑교회" - - &fire_department "소방청" - - &lg_electronics "LG전자 세종본점" - - &sejong_gov_complex "정부세종청사정류장(남측)BS" - - &dodam_library "도담풍경채도서관(도담마을6,9단지BS)" - - &bumjigi_village "범지기마을10단지 교회BS" - - &jochiwon_xi_apartment "조치원 자이아파트(노브랜드앞BS)" - #- &jochiwon_burger_king "조치원 버거킹 사거리(LG전자 조치원점)" - - &sinbong "신봉초등학교BS" - - &koreatech "한기대" - -to_school: - - route_name: "세종" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *complex_nine - arrival_time: "07:20" - - node_name: *sejong_city_hall - arrival_time: "07:25" - - node_name: *sejong_love_church - arrival_time: "07:28" - - node_name: *fire_department - arrival_time: "07:34" - - node_name: *lg_electronics - arrival_time: "07:36" - - node_name: *sejong_gov_complex - arrival_time: "07:41" - - node_name: *dodam_library - arrival_time: "07:43" - - node_name: *bumjigi_village - arrival_time: "07:48" - - node_name: *jochiwon_xi_apartment - arrival_time: "08:00" - - node_name: *sinbong - arrival_time: "08:03" - - node_name: *koreatech - arrival_time: "08:50" - -from_school: - - route_name: "세종" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:10" - - node_name: *sinbong - arrival_time: "18:57" - - node_name: *jochiwon_xi_apartment - arrival_time: "19:00" - - node_name: *bumjigi_village - arrival_time: "19:12" - - node_name: *dodam_library - arrival_time: "19:17" - - node_name: *sejong_gov_complex - arrival_time: "19:19" - - node_name: *lg_electronics - arrival_time: "19:24" - - node_name: *fire_department - arrival_time: "19:26" - - node_name: *sejong_love_church - arrival_time: "19:32" - - node_name: *sejong_city_hall - arrival_time: "19:35" - - node_name: *complex_nine - arrival_time: "19:40" \ No newline at end of file diff --git a/crawling/koreatech_bus/seoul_commuting.yaml b/crawling/koreatech_bus/seoul_commuting.yaml deleted file mode 100644 index 23b0ade..0000000 --- a/crawling/koreatech_bus/seoul_commuting.yaml +++ /dev/null @@ -1,30 +0,0 @@ -nodes: - - &koreatech "한기대" - - &seoul_gyodae_station "서울 교대역 3호선 14번 출구" - - &dongcheon_station "동천역 환승정류장" - - &jukjeon_simple_station "죽전 간이정류장" - - &south_terminal "남부터미널" - -to_school: - - route_name: "월요일~금요일 (07시20분)" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *seoul_gyodae_station - arrival_time: "07:20" - - node_name: *dongcheon_station - arrival_time: "07:47" - - node_name: *jukjeon_simple_station - arrival_time: "07:50" - - node_name: *koreatech - arrival_time: "08:50" - -from_school: - - route_name: "월요일~금요일 (18시10분)" - running_days: [ "MON", "TUE", "WED", "THU", "FRI" ] - arrival_info: - - node_name: *koreatech - arrival_time: "18:10" - - node_name: *jukjeon_simple_station - arrival_time: "하차" - - node_name: *seoul_gyodae_station - arrival_time: "하차" diff --git a/crawling/koreatech_calendar.py b/crawling/koreatech_calendar.py deleted file mode 100644 index 0308cfb..0000000 --- a/crawling/koreatech_calendar.py +++ /dev/null @@ -1,108 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import re -from urllib.parse import urlparse, parse_qs -import pymysql -import config - -def connect_db(): - conn = pymysql.connect(host=config.DATABASE_CONFIG['host'], - port=config.DATABASE_CONFIG['port'], - user=config.DATABASE_CONFIG['user'], - password=config.DATABASE_CONFIG['password'], - db=config.DATABASE_CONFIG['db']) - return conn - -def crawling(): - cs = [] - - url = "https://www.koreatech.ac.kr/prog/schedule/kor/sub04_01_01_01/1/haksa.do" - - html = requests.get(url, verify=False) - soup = BeautifulSoup(html.text, "html.parser") - - table = soup.find('div', class_= 'schedule_table_web') - trs = table.select('table > tbody > tr') - - year = soup.find('div', class_= 'schdule_title').find('p').text[:4] - - seq = 0 - - for tr in trs: - seq += 1 - - th = tr.find('th') - - if(th != None): - month = str(th.text[:-1]) - month = "%02d" % (int(month)) - - tds = tr.select('td') - - calendar = Calendar(year, None, None, None, None, None, seq, None) - - date = tds[0].text - schedule = tds[1].text - dates = str(date).split('~') - - if(len(dates) == 2): - calendar.start_month = month - calendar.start_day = dates[0] - calendar.schedule = schedule - calendar.is_continued = 1 - - endDates = str(dates[1]).split('.') - cnt = len(endDates) - if(cnt == 1): - calendar.end_month = month - calendar.end_day = endDates[0] - elif(cnt == 2): - calendar.end_month = endDates[0] - calendar.end_day = endDates[1] - else: - calendar.start_month = month - calendar.end_month = month - calendar.start_day = date - calendar.end_day = date - calendar.schedule = schedule - calendar.is_continued = 0 - - cs.append(calendar) - print('updating %s - %s %s' % (str(calendar.year), str(calendar.start_month), str(seq))) - - updateDB(cs) - pass - - -def updateDB(cs): - cur = connection.cursor() - - for c in cs: - try: - sql = "INSERT INTO koin.calendar_universities(year, start_month, end_month, start_day, end_day, schedule, seq, is_continued) \ - VALUES ('%s', '%s', '%s', '%s', '%s', '%s', %s, %s) \ - ON DUPLICATE KEY UPDATE year = %s, seq = %s" - - cur.execute(sql % (c.year, c.start_month, c.end_month, c.start_day, c.end_day, c.schedule, c.seq, c.is_continued, c.year, c.seq)) - - connection.commit() - except Exception as error: - connection.rollback() - print(error) - -class Calendar: - def __init__(self, year, startMonth, endMonth, startDay, endDay, schedule, seq, isContinued): - self.year = year - self.start_month = startMonth - self.end_month = endMonth - self.start_day = startDay - self.end_day = endDay - self.schedule = schedule - self.seq = seq - self.is_continued = isContinued - -if __name__ == "__main__": - # execute only if run as a script - connection = connect_db() - crawling() - connection.close() diff --git a/crawling/koreatech_closed_lecture.py b/crawling/koreatech_closed_lecture.py deleted file mode 100644 index e5e8dc1..0000000 --- a/crawling/koreatech_closed_lecture.py +++ /dev/null @@ -1,94 +0,0 @@ -import pymysql -import urllib3 -import openpyxl -import config -import time - -### static field ### -# 폐강된 강좌 엑셀파일 -filename = 'lecture_closed.xlsx' # 읽어들일 엑셀파일명 -start_row = 4 # 데이터가 시작하는 row -year_col = 'B' # 학년도 column -semester_col = 'C' # 학기 column -code_col = 'D' # 교과목코드 column -name_col = 'E' # 교과목명 column -grades_col = 'G' # 학점 column -class_number_col = 'K' # 분반 column -department_col = 'N' # 개설학과 column -professor_col = 'M' # 교수 column -is_english_col = 'O' # 영어강의여부 column - - -def connect_db(): - urllib3.disable_warnings() - conn = pymysql.connect(host=config.DATABASE_CONFIG['host'], - port=config.DATABASE_CONFIG['port'], - user=config.DATABASE_CONFIG['user'], - password=config.DATABASE_CONFIG['password'], - db=config.DATABASE_CONFIG['db'], - charset='utf8') - return conn - - -def crawling(): - wb = openpyxl.load_workbook(filename=filename) - ws = wb.active - lectures = [] - year = ws['%s%d' % (year_col, start_row)].value - semester = ws['%s%d' % (semester_col, start_row)].value - semester_date = '%s%s' % (year, semester.split('학기')[0]) - - for row in range(start_row, ws.max_row + 1): - code = ws['%s%d' % (code_col, row)].value - name = ws['%s%d' % (name_col, row)].value - grades = ws['%s%d' % (grades_col, row)].value - class_number = ws['%s%d' % (class_number_col, row)].value - department = ws['%s%d' % (department_col, row)].value - professor = ws['%s%d' % (professor_col, row)].value - is_english = ws['%s%d' % (is_english_col, row)].value - lecture = Lecture(semester_date=semester_date, code=code, name=name, grades=grades, class_number=class_number, - department=department, professor=professor, is_english=is_english) - lectures.append(lecture) - - # print(semester_date, code, name, grades, class_number, department, professor, is_english) - - updateDB(lectures, semester_date) - pass - - -def updateDB(lectures, semester_date): - cur = connection.cursor() - try: - for lecture in lectures: - sql = "DELETE FROM koin.lectures WHERE semester_date='%s' and code='%s' and name='%s' and grades='%s' and class='%s' and department='%s' and professor='%s' and is_english='%s'" - - cur.execute(sql % ( - lecture.semester_date, lecture.code, lecture.name, lecture.grades, lecture.class_number, - lecture.department, lecture.professor, lecture.is_english)) - - cur.execute("UPDATE koin.versions SET version = '%s_%d' WHERE type = 'timetable'" % (semester_date, int(time.time()))) - connection.commit() - - except Exception as error: - connection.rollback() - print(error) - - -class Lecture: - def __init__(self, semester_date, code, name, grades, class_number, department, professor, - is_english): - self.semester_date = semester_date - self.code = code - self.name = name - self.grades = grades - self.class_number = class_number - self.department = department - self.professor = professor - self.is_english = is_english - pass - - -if __name__ == "__main__": - connection = connect_db() - crawling() - connection.close() diff --git a/crawling/koreatech_notice.py b/crawling/koreatech_notice.py deleted file mode 100644 index 84f18e0..0000000 --- a/crawling/koreatech_notice.py +++ /dev/null @@ -1,213 +0,0 @@ -import datetime -import requests -from bs4 import BeautifulSoup -import re -import urllib3 -from urllib.parse import urlparse, parse_qs -import pymysql -import json -import config -from slack_notice import filter_nas, notice_to_slack - - -def connect_db(): - urllib3.disable_warnings() - conn = pymysql.connect(host=config.DATABASE_CONFIG['host'], - port=config.DATABASE_CONFIG['port'], - user=config.DATABASE_CONFIG['user'], - password=config.DATABASE_CONFIG['password'], - db=config.DATABASE_CONFIG['db'], - charset='utf8') - return conn - - -noticeIds = { - "14": "NA001", - "15": "NA002", - "16": "NA003", - "17": "NA004" -} - -tags = { - "NA001": "일반공지", - "NA002": "장학공지", - "NA003": "학사공지", - "NA004": "취업공지", - "NA005": "코인공지" -} - - -def crawling(noticeId, ls=10): - nas = [] - tag = noticeIds[noticeId] - boardId = getBoardId(tag) - - if noticeId == "17": - # 취업공지 - host = "https://job.koreatech.ac.kr" - - url = host + "/jobs/notice/jobNoticeList.aspx?page=1" - html = requests.get(url, verify=False) - soup = BeautifulSoup(html.text, "html.parser") - - trs = soup.select('table > tbody > tr') - - for tr in trs: - td = tr.select('td') - author = td[2].text - title = td[3].text - permalink = host + td[3].find('a').get('href') - - parsed_url = urlparse(permalink) - qs = parse_qs(parsed_url.query) - articleNum = qs.get('idx')[0] - - na = NoticeArticle(boardId, title, author, articleNum, permalink) - setContentJob(na) - - nas.append(na) - - print('find... %s %s' % (tag, str(articleNum))) - else: - host = "https://portal.koreatech.ac.kr" - - url = host + "/ctt/bb/bulletin?b=" + str(noticeId) - html = requests.get(url, verify=False) - soup = BeautifulSoup(html.text, "html.parser") - - trs = soup.select('table > tbody > tr') - - for tr in trs: - permalink = host + tr.get('data-url') - - td = tr.select('td') - articleNum = td[0].text.strip() - - if noticeId == "14": - title = td[2].text.strip() - author = td[4].text.strip() - else: - title = td[1].text.strip() - author = td[3].text.strip() - - na = NoticeArticle(boardId, title, author, articleNum, permalink) - setContent(na) - - nas.append(na) - print('find... %s %s' % (tag, str(articleNum))) - - return nas - - -def setContent(na): - html = requests.get(na.permalink, verify=False) - soup = BeautifulSoup(html.text, "html.parser") - - content = soup.find('div', class_="bc-s-post-ctnt-area") - registered_at = soup.find('table', class_="kut-board-title-table").select('tbody > tr > td')[1].text.strip() - content = str(content).replace('src="/ctt/', 'src="https://portal.koreatech.ac.kr/ctt/') - - na.content = re.sub("()", "", str(content)) - na.registered_at = registered_at - - -def setContentJob(na): - html = requests.get(na.permalink, verify=False) - soup = BeautifulSoup(html.text, "html.parser") - - content = soup.find('tr', class_="content") - content = str(content).replace('src="/ctt/', 'src="https://portal.koreatech.ac.kr/ctt/') - content = str(content).replace('src="/cheditors/', 'src="https://job.koreatech.ac.kr/cheditors/') - content = re.sub("()", "", str(content)) - - registered_at = soup.findAll('tr', class_="head")[1].find('td').text - registered_at = str(registered_at) - registered_at = registered_at[0:8] + registered_at[11:] - - na.content = content - na.registered_at = registered_at - - -def getBoardId(tag): - sql = "SELECT id FROM koin.boards WHERE tag = '%s'" - cur = connection.cursor() - cur.execute(sql % tag) - rows = cur.fetchall() - return rows[0][0] # db에 있는 boards의 id - - -def updateDB(nas): - cur = connection.cursor() - - for na in nas: - na.content = na.content.replace("'", """''""") # sql문에서 작은따옴표 이스케이프 처리 - na.title = na.title.replace("'", """''""") # sql문에서 작은따옴표 이스케이프 처리 - try: - notice_sql = "INSERT INTO koin.notice_articles(board_id, title, content, author, hit, is_deleted, article_num, permalink, has_notice, registered_at) \ - VALUES (%d, '%s', '%s', '%s', %d, %d, %d, '%s', %d, '%s') \ - ON DUPLICATE KEY UPDATE title = '%s', content = '%s', author = '%s'" - - notice_query = notice_sql % ( - na.board_id, na.title, na.content, na.author, na.hit, na.is_deleted, int(na.article_num), na.permalink, - na.has_notice, na.registered_at, - na.title, na.content, na.author - ) - - cur.execute(notice_query) - print("NOTICE_QUERY :", na.board_id, na.title[0:31], na.author) - - newNoticeId = cur.lastrowid - - meta = json.dumps({"registered_at": na.registered_at, "permalink": na.permalink}) - - article_sql = "INSERT INTO koin.articles(board_id, title, nickname, content, user_id, ip, meta, is_notice, created_at, notice_article_id) \ - VALUES (%d, '%s', '%s', '%s', %d, '%s', '%s', %d, '%s', %d) \ - ON DUPLICATE KEY UPDATE title = '%s', content = '%s', nickname = '%s'" - - article_query = article_sql % ( - na.board_id, na.title, na.author, na.content, 0, "127.0.0.1", meta, 1, na.registered_at, newNoticeId, - na.title, na.content, na.author - ) - - cur.execute(article_query) - print("ARTICLE_QUERY :", na.board_id, na.title[0:31], na.author) - - connection.commit() - - except Exception as error: - connection.rollback() - print(error) - - -class NoticeArticle: - def __init__(self, boardId, title, author, articleNum, permalink): - self.board_id = boardId - self.title = title - self.content = None - self.author = author - self.hit = 0 - self.is_deleted = 0 - self.article_num = articleNum - self.permalink = permalink - self.has_notice = 0 - self.registered_at = None - pass - - -if __name__ == "__main__": - # execute only if run as a script - articles = [] - connection = connect_db() - for noticeId in noticeIds.keys(): - nas = crawling(noticeId) - print(nas) - - # DB에 없고, 키워드가 들어있는 게시글 필터링 - articles.extend(filter_nas(connection, nas, keywords={"버스", "bus"})) - - updateDB(nas) - - connection.close() - - if articles: - notice_to_slack(articles) diff --git a/crawling/koreatech_portal/config.example.py b/crawling/koreatech_portal/config.example.py index b1b3835..f7dea69 100644 --- a/crawling/koreatech_portal/config.example.py +++ b/crawling/koreatech_portal/config.example.py @@ -1,25 +1,11 @@ -PORTAL_CONFIG = { - 'id': '', - 'pw': '', - 'ip': '' -} +import importlib.util +import os -GMAIL_CONFIG = { - 'id': '', - 'pw': '' -} +_parent_config = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'config.py') +_spec = importlib.util.spec_from_file_location('_central_config', _parent_config) +_mod = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_mod) -MYSQL_CONFIG = { - 'host': '', - 'db': '', - 'user': '', - 'password': '', - 'port': 0 -} - -REDIS_CONFIG = { - 'host': '', - 'port': 0, - 'db': '', - 'password': '' -} +for _name in dir(_mod): + if not _name.startswith('_'): + globals()[_name] = getattr(_mod, _name) diff --git a/crawling/search_migration.py b/crawling/search_migration.py deleted file mode 100644 index dee259a..0000000 --- a/crawling/search_migration.py +++ /dev/null @@ -1,211 +0,0 @@ -# NOTE : 추후 Python 3.6 이상을 쓰면 문자열 모두 f-string으로 변경할 것 -# @Author : 정종우 -# @Modified : 최선문 / 2020.04.29 - -import pymysql -import urllib3 -import bs4 -from enum import Enum -from config import DATABASE_CONFIG - -articleBoard = { - 1: 5, - 2: 6, - 5: 0, - 6: 1, - 7: 2, - 8: 3, - 9: 4, - 10: 8 -} - -# @Desc : 서비스 타입을 나타내는 열거형으로 koin table_id 값을 따른다. -class EServiceType(Enum): - Anonymous = 7 - LostItem = 9 - Market = 10 - Event = 11 - -# @Desc : 이전할 스키마 타입이다. 각 값은 서비스 타입을 따른다. -class ESchemaType(Enum): - # articles는 board_id를 이용해 값을 할당한다. - articles = -1 - temp_articles = EServiceType.Anonymous.value - lost_items = EServiceType.LostItem.value - items = EServiceType.Market.value - event_articles = EServiceType.Event.value - -# @Desc : 검색용 아티클 -class SearchArticle: - def __init__(self, schemaType): - self.schemaType = ESchemaType[schemaType] - self.table_id = self.schemaType.value - self.article_id = 0 - self.user_id = "NULL" - self.__title = "" - self.__content = "" - self.nickname = "NULL" - self.is_deleted = 0 - self.created_at = "" - self.updated_at = "" - - def __str__(self): - return "SearchArticles{" + \ - "table_id='" + str(self.table_id) + '\'' + \ - ", article_id='" + str(self.article_id) + '\'' + \ - ", title='" + str(self.title) + '\'' + \ - ", content='" + str(self.content) + '\'' + \ - ", user_id='" + str(self.user_id) + '\'' + \ - ", nickname='" + str(self.nickname) + '\'' + \ - ", is_deleted='" + str(self.is_deleted) + '\'' + \ - ", created_at='" + str(self.created_at) + '\'' + \ - ", updated_at='" + str(self.updated_at) + '\'' + \ - "}" - - @property - def title(self): - return self.__title - - @title.setter - def title(self, value): - self.__title = self.convertToDbString(value) - - @property - def content(self): - return self.__content - - @content.setter - def content(self, value): - self.__content = self.convertToDbString(value) - - @staticmethod - def convertToDbString(source): - # HACK : DB에 제대로 저장되려면 아래와 같이 바꿔줘야 한다. - return source.replace("\\", "\\\\").replace("'", "\\'") - -# @Desc : DB와 연결한다. -def getConnectionToDB(): - # HACK : 혹시나 URL로 접속한다면 HTTPS 접속 때문에 오류가 생길 수 있다. - # 그럴 때 이 코드를 활성화 하라. - # urllib3.disable_warnings() - - conn = pymysql.connect( - host = DATABASE_CONFIG['host'], - port= DATABASE_CONFIG['port'], - user = DATABASE_CONFIG['user'], - password = DATABASE_CONFIG['password'], - db = DATABASE_CONFIG['db'], - charset = 'utf8', cursorclass = pymysql.cursors.DictCursor) - - return conn - -# @Author : 최선문 -# @Return : SearchArticle -# @Param -# schemaType : ESchemaType에 들어있는 열거형 값의 name이다. -# row : DictCursor로 조회한 레코드다. -# @Desc : 스키마 타입과 행을 이용해서 SearchArticle 객체를 생성한다. -def makeSearchArticle(schemaType, row): - searchArticle = SearchArticle(schemaType) - - # articles 스키마의 경우 table_id를 설정할 때 예외 처리를 해야 한다. - if searchArticle.schemaType == ESchemaType.articles: - if row["board_id"] not in articleBoard: - return None - searchArticle.table_id = articleBoard[row["board_id"]] - # 그 외의 경우는 ESchemaType 값으로 넣는다. - else: - searchArticle.table_id = searchArticle.schemaType.value - - searchArticle.article_id = row["id"] - searchArticle.user_id = row.get("user_id", "NULL") - searchArticle.title = row["title"] - searchArticle.nickname = row["nickname"] - searchArticle.is_deleted = row["is_deleted"] - searchArticle.created_at = row["created_at"] - searchArticle.updated_at = row["updated_at"] - content = row["content"] if row["content"] is not None else "" - soup = bs4.BeautifulSoup(content, features = "html.parser") - searchArticle.content = soup.text.strip() - - return searchArticle - -# @Author : 최선문 -# @Date : 2020.04.28 -# @Param -# schemaType : ESchemaType에 들어있는 열거형의 name이다. -# @Desc : 스키마에 있는 모든 컬럼을 가져와 search_articles로 이전한다. -def migrate(schemaType): - COUNT = 5000 - - # Row를 얻어 온다. - id = 0 - while True: - rows = [] - with connection.cursor() as cursor: - cursor.execute("SELECT * FROM koin.{} LIMIT {}, {}".format(schemaType, id, COUNT)) - rows = cursor.fetchall() - size = len(rows) - - # 더 가져올 행이 없다면 다음 스키마를 조회한다. - if size == 0: - break - - print("[Log] Selected Row From {} to {} : {}".format(id, id + size, size)) - id += size - - # search_articles에 넣는다. - count = 0 - for row in rows: - searchArticle = makeSearchArticle(schemaType, row) - if not searchArticle: - continue - updateDB(searchArticle) - count += 1 - print("[Log] Current row : {}".format(count), end = "\r") - print("\n[Log] Done") - -# @Param -# searchArticle : makeSearchArticle로 생성한 객체다. -# @Desc : search_articles로 해당 row를 insert 한다. -def updateDB(searchArticle): - try: - with connection.cursor() as cursor: - # SQL문 생성 - sql = """ - INSERT INTO koin.search_articles (table_id, article_id, title, content, user_id, nickname, is_deleted, created_at, updated_at) VALUES ('%s', '%s', '%s', '%s', %s, '%s', '%s', '%s', '%s') ON DUPLICATE KEY UPDATE title = '%s', content = '%s', user_id = %s, nickname = '%s', is_deleted = '%s' - """ - - completedSQL = sql % (searchArticle.table_id, searchArticle.article_id, searchArticle.title, searchArticle.content, searchArticle.user_id, searchArticle.nickname, searchArticle.is_deleted, searchArticle.created_at, searchArticle.updated_at, searchArticle.title, searchArticle.content, searchArticle.user_id, searchArticle.nickname, searchArticle.is_deleted) - - # SQL 검증 - # print(completedSQL) - - # 질의 실행 - cursor.execute(completedSQL) - - # 커밋 - connection.commit() - except Exception as error: - print("[Error] Row : {}".format(searchArticle)) - raise error - -if __name__ == "__main__": - print("[Log] Migration Start") - - # DB 연결 - connection = getConnectionToDB() - print("[Log] Connection Succeeded") - - # 스키마 이전 - try: - for schema, value in ESchemaType.__members__.items(): - print("[Log] Start migrating {}".format(schema)) - migrate(schema) - except Exception as error: - connection.rollback() - print("[Error] {}".format(error)) - print("[Log] Rollbacking...") - finally: - connection.close() - print("[Log] Connection Closed") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 054bbab..ea1005c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ bs4 regex urllib3 openpyxl -config +python-dotenv selenium redis lxml