This repository was archived by the owner on Dec 9, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Expand file tree
/
Copy pathpdf2htmlEX.cc
More file actions
447 lines (390 loc) · 15 KB
/
pdf2htmlEX.cc
File metadata and controls
447 lines (390 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
// pdf2htmlEX.cc
//
// Copyright (C) 2012-2015 Lu Wang <coolwanglu@gmail.com>
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <cstring>
#include <ctime>
#include <string>
#include <limits>
#include <iostream>
#include <memory>
#include <errno.h>
#include <getopt.h>
#include <poppler-config.h>
#include <goo/GooString.h>
#include <Object.h>
#include <PDFDoc.h>
#include <PDFDocFactory.h>
#include <GlobalParams.h>
#include "pdf2htmlEX-config.h"
#if ENABLE_SVG
#include <cairo.h>
#endif
#include "ArgParser.h"
#include "Param.h"
#include "HTMLRenderer/HTMLRenderer.h"
#include "util/path.h"
#include "util/ffw.h"
#ifdef __MINGW32__
#include "util/mingw.h"
#endif
// NOTE(review): file-scope using-directives; every unqualified name below
// (cerr, string, unique_ptr, ...) resolves through these two namespaces.
using namespace std;
using namespace pdf2htmlEX;
// Global conversion settings: main() seeds the directory defaults,
// parse_options() binds the fields to command-line options, and
// check_param() validates / completes them.
Param param;
// Global command-line parser: options are registered and parsed in
// parse_options(); show_usage_and_exit() asks it to print the option list.
ArgParser argparser;
// Print the command-line synopsis and the option list to stderr, then
// terminate the process with EXIT_FAILURE.
// `dummy` is ignored; the parameter exists so the function has the signature
// used when it is registered as the handler of --help in parse_options().
void show_usage_and_exit(const char * dummy = nullptr)
{
    (void)dummy; // intentionally unused

    std::cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << std::endl;
    argparser.show_usage(std::cerr);

    std::exit(EXIT_FAILURE);
}
// Print version, copyright, linked library versions, the default data
// directory and the compiled-in background image formats to stderr, then
// terminate the process with EXIT_SUCCESS.
// `dummy` is ignored; the parameter exists so the function has the signature
// used when it is registered as the handler of --version in parse_options().
void show_version_and_exit(const char * dummy = nullptr)
{
    (void)dummy; // intentionally unused

    // program banner
    cerr << "pdf2htmlEX version " << PDF2HTMLEX_VERSION << endl
         << "Copyright 2012-2015 Lu Wang <coolwanglu@gmail.com> and other contributors" << endl;

    // libraries this binary was built against
    cerr << "Libraries: " << endl
         << " poppler " << POPPLER_VERSION << endl;
    cerr << " libfontforge " << ffw_get_version() << endl;
#if ENABLE_SVG
    cerr << " cairo " << cairo_version_string() << endl;
#endif

    cerr << "Default data-dir: " << param.data_dir << endl;

    // one token per image backend that was enabled at build time
    cerr << "Supported image format:";
#ifdef ENABLE_LIBPNG
    cerr << " png";
#endif
#ifdef ENABLE_LIBJPEG
    cerr << " jpg";
#endif
#if ENABLE_SVG
    cerr << " svg";
#endif
    cerr << endl
         << endl;

    exit(EXIT_SUCCESS);
}
// Handler for the compact --embed option.
// Walks `str` one character at a time: a lowercase letter clears the matching
// param.embed_* flag, the uppercase letter sets it
// (c/C css, f/F font, i/I image, j/J javascript, o/O outline).
// Any other character is reported on stderr and otherwise skipped.
void embed_parser (const char * str)
{
    for(const char * cur = str; *cur != '\0'; ++cur)
    {
        const char flag = *cur;
        switch(flag)
        {
            case 'c': param.embed_css = 0; break;
            case 'C': param.embed_css = 1; break;

            case 'f': param.embed_font = 0; break;
            case 'F': param.embed_font = 1; break;

            case 'i': param.embed_image = 0; break;
            case 'I': param.embed_image = 1; break;

            case 'j': param.embed_javascript = 0; break;
            case 'J': param.embed_javascript = 1; break;

            case 'o': param.embed_outline = 0; break;
            case 'O': param.embed_outline = 1; break;

            default:
                cerr << "Unknown character `" << flag << "` for --embed" << endl;
                break;
        }
    }
}
void prepare_directories()
{
std::string tmp_dir = param.tmp_dir + "/pdf2htmlEX-XXXXXX";
errno = 0;
unique_ptr<char[]> pBuf(new char[tmp_dir.size() + 1]);
strcpy(pBuf.get(), tmp_dir.c_str());
auto p = mkdtemp(pBuf.get());
if(p == nullptr)
{
const char * errmsg = strerror(errno);
if(!errmsg)
{
errmsg = "unknown error";
}
cerr << "Cannot create temp directory: " << errmsg << endl;
exit(EXIT_FAILURE);
}
param.tmp_dir = pBuf.get();
}
void parse_options (int argc, char **argv)
{
argparser
// pages
.add("first-page,f", ¶m.first_page, 1, "first page to convert")
.add("last-page,l", ¶m.last_page, numeric_limits<int>::max(), "last page to convert")
// dimensions
.add("zoom", ¶m.zoom, 0, "zoom ratio", true)
.add("fit-width", ¶m.fit_width, 0, "fit width to <fp> pixels", true)
.add("fit-height", ¶m.fit_height, 0, "fit height to <fp> pixels", true)
.add("use-cropbox", ¶m.use_cropbox, 1, "use CropBox instead of MediaBox")
.add("hdpi", ¶m.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
.add("vdpi", ¶m.v_dpi, 144.0, "vertical resolution for graphics in DPI")
// output files
.add("embed", "specify which elements should be embedded into output", embed_parser, true)
.add("embed-css", ¶m.embed_css, 1, "embed CSS files into output")
.add("embed-font", ¶m.embed_font, 1, "embed font files into output")
.add("embed-image", ¶m.embed_image, 1, "embed image files into output")
.add("embed-javascript", ¶m.embed_javascript, 1, "embed JavaScript files into output")
.add("embed-outline", ¶m.embed_outline, 1, "embed outlines into output")
.add("split-pages", ¶m.split_pages, 0, "split pages into separate files")
.add("dest-dir", ¶m.dest_dir, ".", "specify destination directory")
.add("css-filename", ¶m.css_filename, "", "filename of the generated css file")
.add("page-filename", ¶m.page_filename, "", "filename template for split pages ")
.add("outline-filename", ¶m.outline_filename, "", "filename of the generated outline file")
.add("process-nontext", ¶m.process_nontext, 1, "render graphics in addition to text")
.add("process-outline", ¶m.process_outline, 1, "show outline in HTML")
.add("process-annotation", ¶m.process_annotation, 0, "show annotation in HTML")
.add("process-form", ¶m.process_form, 0, "include text fields and radio buttons")
.add("printing", ¶m.printing, 1, "enable printing support")
.add("fallback", ¶m.fallback, 0, "output in fallback mode")
.add("tmp-file-size-limit", ¶m.tmp_file_size_limit, -1, "Maximum size (in KB) used by temporary files, -1 for no limit")
// fonts
.add("embed-external-font", ¶m.embed_external_font, 1, "embed local match for external fonts")
.add("font-format", ¶m.font_format, "woff", "suffix for embedded font files (ttf,otf,woff,svg)")
.add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi")
.add("auto-hint", ¶m.auto_hint, 0, "use fontforge autohint on fonts without hints")
.add("external-hint-tool", ¶m.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)")
.add("stretch-narrow-glyph", ¶m.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them")
.add("squeeze-wide-glyph", ¶m.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them")
.add("override-fstype", ¶m.override_fstype, 0, "clear the fstype bits in TTF/OTF fonts")
.add("process-type3", ¶m.process_type3, 0, "convert Type 3 fonts for web (experimental)")
// text
.add("heps", ¶m.h_eps, 1.0, "horizontal threshold for merging text, in pixels")
.add("veps", ¶m.v_eps, 1.0, "vertical threshold for merging text, in pixels")
.add("space-threshold", ¶m.space_threshold, (1.0/8), "word break threshold (threshold * em)")
.add("font-size-multiplier", ¶m.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy")
.add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets")
.add("tounicode", ¶m.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
.add("optimize-text", ¶m.optimize_text, 0, "try to reduce the number of HTML elements used for text")
.add("correct-text-visibility", ¶m.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them")
// background image
.add("bg-format", ¶m.bg_format, "png", "specify background image format")
.add("svg-node-count-limit", ¶m.svg_node_count_limit, -1, "if node count in a svg background image exceeds this limit,"
" fall back this page to bitmap background; negative value means no limit")
.add("svg-embed-bitmap", ¶m.svg_embed_bitmap, 1, "1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible")
// encryption
.add("owner-password,o", ¶m.owner_password, "", "owner password (for encrypted files)", true)
.add("user-password,u", ¶m.user_password, "", "user password (for encrypted files)", true)
.add("no-drm", ¶m.no_drm, 0, "override document DRM settings")
// misc.
.add("clean-tmp", ¶m.clean_tmp, 1, "remove temporary files after conversion")
.add("tmp-dir", ¶m.tmp_dir, param.tmp_dir, "specify the location of temporary directory")
.add("data-dir", ¶m.data_dir, param.data_dir, "specify data directory")
.add("poppler-data-dir", ¶m.poppler_data_dir, param.poppler_data_dir, "specify poppler data directory")
.add("debug", ¶m.debug, 0, "print debugging information")
.add("proof", ¶m.proof, 0, "texts are drawn on both text layer and background for proof")
.add("quiet", ¶m.quiet, 0, "perform operations quietly")
// meta
.add("version,v", "print copyright and version info", &show_version_and_exit)
.add("help,h", "print usage information", &show_usage_and_exit)
.add("", ¶m.input_filename, "", "")
.add("", ¶m.output_filename, "", "")
;
try
{
argparser.parse(argc, argv);
}
catch(const char * s)
{
// if s == "", getopt_long would have printed the error message
if(s && s[0])
{
cerr << "Error when parsing the arguments:" << endl;
cerr << s << endl;
}
exit(EXIT_FAILURE);
}
catch(const std::string & s)
{
// if s == "", getopt_long would have printed the error message
if(s != "")
{
cerr << "Error when parsing the arguments:" << endl;
cerr << s << endl;
}
exit(EXIT_FAILURE);
}
}
void check_param()
{
if (param.input_filename == "")
{
show_usage_and_exit();
}
if(param.output_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.output_filename = s.substr(0, s.size() - 4) + ".html";
}
else
{
param.output_filename = s + ".html";
}
}
if(param.page_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.page_filename = s.substr(0, s.size() - 4) + "%d.page";
}
else
{
param.page_filename = s + "%d.page";
}
sanitize_filename(param.page_filename);
}
else
{
// Need to make sure we have a page number placeholder in the filename
if(!sanitize_filename(param.page_filename))
{
// Inject the placeholder just before the file extension
const string suffix = get_suffix(param.page_filename);
param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix;
sanitize_filename(param.page_filename);
}
}
if(param.css_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.css_filename = s.substr(0, s.size() - 4) + ".css";
}
else
{
param.css_filename = s + ".css";
}
}
if(param.outline_filename.empty())
{
const string s = get_filename(param.input_filename);
if(get_suffix(param.input_filename) == ".pdf")
{
param.outline_filename = s.substr(0, s.size() - 4) + ".outline";
}
else
{
if(!param.split_pages)
param.outline_filename = s + ".outline";
}
}
if(false) { }
#ifdef ENABLE_LIBPNG
else if (param.bg_format == "png") { }
#endif
#ifdef ENABLE_LIBJPEG
else if (param.bg_format == "jpg") { }
#endif
#if ENABLE_SVG
else if(param.bg_format == "svg") { }
#endif
else
{
cerr << "Image format not supported: " << param.bg_format << endl;
exit(EXIT_FAILURE);
}
#if not ENABLE_SVG
if(param.process_type3)
{
cerr << "process-type3 is enabled, however SVG support is not built in this version of pdf2htmlEX." << endl;
exit(EXIT_FAILURE);
}
#endif
if((param.font_format == "ttf") && (param.external_hint_tool == ""))
{
cerr << "Warning: No hint tool is specified for truetype fonts, the result may be rendered poorly in some circumstances." << endl;
}
if (param.embed_image && (param.bg_format == "svg") && !param.svg_embed_bitmap)
{
cerr << "Warning: --svg-embed-bitmap is forced on because --embed-image is on, or the dumped bitmaps can't be loaded." << endl;
param.svg_embed_bitmap = 1;
}
}
int main(int argc, char **argv)
{
// We need to adjust these directories before parsing the options.
#if defined(__MINGW32__)
param.data_dir = get_exec_dir(argv[0]);
param.tmp_dir = get_tmp_dir();
#else
char const* tmp = getenv("TMPDIR");
#ifdef P_tmpdir
if (!tmp)
tmp = P_tmpdir;
#endif
#ifdef _PATH_TMP
if (!tmp)
tmp = _PATH_TMP;
#endif
if (!tmp)
tmp = "/tmp";
param.tmp_dir = string(tmp);
param.data_dir = PDF2HTMLEX_DATA_PATH;
#endif
parse_options(argc, argv);
check_param();
//prepare the directories
prepare_directories();
if(param.debug)
cerr << "temporary dir: " << (param.tmp_dir) << endl;
try
{
create_directories(param.dest_dir);
}
catch (const string & s)
{
cerr << s << endl;
exit(EXIT_FAILURE);
}
bool finished = false;
// read config file
globalParams = new GlobalParams(!param.poppler_data_dir.empty() ? param.poppler_data_dir.c_str() : NULL);
// open PDF file
PDFDoc * doc = nullptr;
try
{
{
GooString * ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str()));
GooString * userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str()));
GooString fileName(param.input_filename.c_str());
doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);
delete userPW;
delete ownerPW;
}
if (!doc->isOk())
throw "Cannot read the file";
// check for copy permission
if (!doc->okToCopy())
{
if (param.no_drm == 0)
throw "Copying of text from this document is not allowed.";
cerr << "Document has copy-protection bit set." << endl;
}
param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages());
param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages());
unique_ptr<HTMLRenderer>(new HTMLRenderer(param))->process(doc);
finished = true;
}
catch (const char * s)
{
cerr << "Error: " << s << endl;
}
catch (const string & s)
{
cerr << "Error: " << s << endl;
}
// clean up
delete doc;
delete globalParams;
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE));
return 0;
}