Mathdown/src/extractPDF.tsx at 91a2e5f211d9b08dd5fd45fd702274e3c64d4f3e · ut-code/Mathdown · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// 叩き台でつくったものをそのまま持ってきているが、最終的には(App.tsxと重複部分も多いので)統合させたい。

import { useState, useMemo, useEffect } from "react";
import { Document, Page, pdfjs } from "react-pdf"; // Make sure to import pdfjs
import "react-pdf/dist/Page/AnnotationLayer.css";
import "react-pdf/dist/Page/TextLayer.css";
import hogeLink from "/hoge.md?url";
import { ExtractDefinitions } from "./MDToDefinitions";
import Markdown from "react-markdown";
import rehypeKatex from "rehype-katex";
import remarkMath from "remark-math";
import Tippy from "@tippyjs/react";

// react-pdfから持ってきたcss https://github.com/wojtekmaj/react-pdf/discussions/1407 参照

import "react-pdf/dist/esm/Page/AnnotationLayer.css";
import "react-pdf/dist/esm/Page/TextLayer.css";

// import { text } from "stream/consumers";
// Tell pdf.js where to load its worker script from (jsDelivr CDN). The URL
// embeds `pdfjs.version`, so the worker requested is the one published for the
// pdf.js version exposed by react-pdf.
pdfjs.GlobalWorkerOptions.workerSrc = `//cdn.jsdelivr.net/npm/pdfjs-dist@${pdfjs.version}/build/pdf.worker.mjs`;
import Textarea from "@mui/joy/Textarea";

// Pair of marker strings; ExtractPDF forwards them to ExtractDefinitions as
// its prefix and suffix arguments.
type optsObject = { prefix: string; suffix: string };
// Minimal shape of the PDF object obtained through react-pdf: `numPages` is the
// total page count and `getPage` gives access to the data (text etc.) of one page.
type pdfType = { numPages: number; getPage: (arg0: number) => any };
// Usage:
// const page = await pdf.getPage(5); // fetch the data of page 5
// const textContent = await page.getTextContent(); // read that page's text content

/** Upper bound on how many pages of the document are rendered in the viewer. */
const MAX_RENDERED_PAGES = 8;

/**
 * Collects the plain text of every page of a loaded PDF, in page order.
 *
 * @param pdf - Loaded document exposing `numPages` and `getPage`.
 * @returns One string per page (index 0 holds page 1); each string is the
 *   page's text items concatenated without a separator.
 */
async function extractPageTexts(pdf: pdfType): Promise<string[]> {
  const pageTexts: string[] = [];
  // `pdf.numPages` is used rather than component state because this runs
  // right after loading, before any state update has been rendered.
  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
    const textContent = await page.getTextContent();
    pageTexts.push(
      textContent.items
        // Items that carry no `str` contribute nothing to the page text.
        .map((item: { str?: string }) => item.str ?? "")
        .join(""),
    );
  }
  return pageTexts;
}

/**
 * PDF viewer with a side panel of term definitions.
 *
 * The user pastes explanatory text into the textarea; definitions are pulled
 * out of it with `ExtractDefinitions` and every term that occurs somewhere in
 * the PDF text is listed. Terms occurring on the page selected with the
 * previous/next buttons get the `color_of_li` class.
 *
 * @param pdfName - Source handed to react-pdf's `Document` as `file`.
 * @param opts - Prefix/suffix markers forwarded to `ExtractDefinitions`.
 */
export function ExtractPDF({
  pdfName,
  opts,
}: {
  pdfName: string;
  opts: optsObject;
}) {
  // -1 means "no document loaded yet".
  const [numPages, setNumPages] = useState<number>(-1);
  const [pageNumber, setPageNumber] = useState<number>(1);
  // The fetched markdown is stored but not read anywhere yet (only the setter is kept).
  const [, setHogeMd] = useState("");
  // Extracted text, one entry per page (index 0 is page 1).
  const [result, setResult] = useState<string[]>([]);
  // User input; a plain textarea is used as a provisional input for now.
  const [explanation, setExplanation] = useState<string>("");

  // Render only pages that exist: at most MAX_RENDERED_PAGES, and never more
  // than the document has. Nothing is rendered until the page count is known.
  const renderedPageCount = Math.min(
    Math.max(numPages, 0),
    MAX_RENDERED_PAGES,
  );
  const pages = Array.from({ length: renderedPageCount }, (_, i) => {
    const pageIndex = i + 1;
    return (
      <Page
        pageNumber={pageIndex}
        width={850}
        key={pageIndex}
        canvasBackground="white"
        scale={1}
        className="design"
      />
    );
  });

  // Memoized so `Document` receives a stable `options` object across renders.
  const options = useMemo(
    () => ({
      cMapUrl: `https://unpkg.com/pdfjs-dist@${pdfjs.version}/cmaps/`, // character-encoding (CMap) resources
      cMapPacked: true,
    }),
    [],
  );

  // Text of the whole document, used to decide which terms are listed at all.
  const fullText = useMemo(() => result.join(""), [result]);

  // Page navigation. Updater functions are used so rapid clicks cannot act on
  // a stale `pageNumber`.
  const goToPrevPage = (): void => {
    setPageNumber((current) => (current > 1 ? current - 1 : current));
  };

  const goToNextPage = (): void => {
    setPageNumber((current) => (current < numPages ? current + 1 : current));
  };

  // Runs once react-pdf has loaded the document.
  async function onDocumentLoadSuccess(pdf: pdfType): Promise<void> {
    // Publish the page count first so the viewer and the navigation do not
    // have to wait for the text extraction below.
    setNumPages(pdf.numPages);
    try {
      setResult(await extractPageTexts(pdf));
    } catch (err) {
      // Keep the viewer usable even when the text cannot be extracted.
      console.error("Error extracting text from PDF:", err);
      setResult([]);
    }
  }

  // Fetch the bundled markdown file once on mount.
  useEffect(() => {
    fetch(hogeLink)
      .then((res) => res.text())
      .then((t) => setHogeMd(t))
      .catch((err) => console.error("Error fetching Hoge.md:", err));
  }, []);

  return (
    <>
      <div className="App">
        <a href="/home">
          <img
            src="../docs/logo.png"
            width="12.5%"
            height="12.5%"
            alt="Mathdown"
          />
        </a>
      </div>
      <h2>PDFの解説表示</h2>
      <div className="flex">
        <div className="explanation">
          <div className="textarea">
            <h4>解説書き込み欄</h4>
          </div>
          <div className="terms">
            <p>
              {pageNumber}ページ目 （{numPages}頁中）
            </p>
            <button disabled={pageNumber <= 1} onClick={goToPrevPage}>
              前ページ
            </button>
            <button disabled={pageNumber >= numPages} onClick={goToNextPage}>
              次ページ
            </button>
            <ul>
              <ReferMap
                // Definitions are taken from the (provisional) user input.
                dictionary={ExtractDefinitions(
                  explanation,
                  opts.prefix,
                  opts.suffix,
                )}
                searchString={fullText}
                // No text exists for the page until extraction has finished;
                // fall back to an empty string instead of passing `undefined`.
                referedString={result[pageNumber - 1] ?? ""}
              />
            </ul>
          </div>
          <div>
            <Textarea
              placeholder="解説をコピー"
              minRows={14}
              onChange={(e) => {
                setExplanation(e.target.value);
              }}
            />
          </div>
        </div>

        <div className="pdf">
          {/* PDF viewer */}
          <div className="pdf_viewer">
            <Document
              file={pdfName}
              options={options}
              onLoadSuccess={onDocumentLoadSuccess}
            >
              {pages}
            </Document>
          </div>
        </div>
      </div>
    </>
  );
}

/**
 * Renders one `<li>` per dictionary term that occurs in the text extracted
 * from the PDF, with the term's definition shown as a Markdown/KaTeX tooltip.
 * (The original author's note asks for a better name for this component.)
 *
 * @param dictionary - Map from term to its definition (Markdown source).
 * @param searchString - Text in which a term must occur to be listed.
 * @param referedString - Text of the page being looked at. Terms occurring in
 *   it get the `color_of_li` class (per the original note, such terms are shown
 *   in light blue in the browser). A missing value is treated as empty text.
 */
function ReferMap({
  dictionary,
  searchString,
  referedString,
}: {
  dictionary: Map<string, string>;
  searchString: string;
  referedString?: string;
}) {
  // The page text may not be available yet; never call `.includes` on undefined.
  const currentPageText = referedString ?? "";

  // Keep only terms found in the searched text. The empty string is skipped
  // explicitly because `includes("")` is true for every text.
  const matchedEntries = Array.from(dictionary).filter(
    ([term]) => term !== "" && searchString.includes(term),
  );

  // A fragment (instead of a bare array) keeps this a valid component for
  // React typings that do not accept array return values.
  return (
    <>
      {matchedEntries.map(([term, definition]) => (
        <li
          key={term}
          className={currentPageText.includes(term) ? "color_of_li" : ""}
        >
          <Tippy
            content={
              <Markdown
                rehypePlugins={[rehypeKatex]}
                remarkPlugins={[remarkMath]}
              >
                {definition}
              </Markdown>
            }
          >
            <span>{term}</span>
          </Tippy>
        </li>
      ))}
    </>
  );
}