-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathHTML5Parser.php
More file actions
207 lines (190 loc) · 6.14 KB
/
HTML5Parser.php
File metadata and controls
207 lines (190 loc) · 6.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
<?php
/*
$htmldoc = '<!DOCTYPE html><html><head><meta charset="utf-8"></head><body><h1>Error none</h1><p>Page found</p></body></html>';
$parser = new HTML5Parser();
$parser->error(function($err){
// $err is an Exception
});
// Example plugins
$parser->extend(function($dom, $q){
// http://learn.jquery.com/plugins/basic-plugin-creation/
$q->fn->greenify = function($that) use($q){
$that->css( "color", "green" );
};
$q->fn->showLinkLocation = function($that) use($q){
$that = isset($this)?$this:($q::$instance);
return $that->each(function($i, $el) use($q) {
$q( $el )->append( " (" . $q( $el )->attr( "href" ) . ")" );
});
};
$q->sum = function() use($q){
return array_sum(func_get_args());
};
});
$parser->parse($htmldoc, function($dom, $jQuery){
// Works with unicode (was surprisingly hard to achieve)
// $dom holds the document DOM
// Run utility functions with $jQuery->parseHTML($data, $context, $keep_scripts);
// Query the DOM with CSS3 and manipulate the selection: $jQuery('.class-name')->attr('rel', 'external');
// Query the DOM with XPATH and manipulate the selection: $jQuery('descendant::select')->val('');
// Get the <body> content html: $jQuery('body')->html();
});
*/
namespace ThinkHTML\jQuery;
use HTML5_Parser;
use ThinkHTML\jQuery\jQuery;
/**
 * HTML parser and document scope creation for jQuery.
 *
 * Parses an HTML string with HTML5_Parser, wraps the resulting DOM in a
 * jQuery object, applies the registered extensions and hands both to a
 * user callback. Errors raised while parsing or inside the callback are
 * stored in $err and forwarded to the handler registered with error().
 */
class HTML5Parser
{
    // State from the last parse() call, plus registered handler/extensions.
    public $dom, $err, $jquery, $error_handle, $extensions = array(), $version = '1.0.0';

    // Effective options (defaults merged with the constructor argument).
    public $opts = array();

    // True when the last parsed string contained no "<html"; null before any parse.
    public $is_fragment;

    /**
     * @param array $options Overrides for 'encoding', 'default_encoding'
     *                       and 'data_camel_case'.
     */
    public function __construct($options=array())
    {
        $default = array(
            'encoding'=>null,
            'default_encoding'=>'utf-8',
            'data_camel_case'=>true,
        );
        $this->opts = array_replace($default, $options);
    }

    /**
     * Register the error handler.
     *
     * @param callable $callback Receives the caught Exception.
     */
    public function error($callback)
    {
        $this->error_handle = $callback;
    }

    /**
     * Make this class extendable in the same way as jQuery.
     *
     * @param callable $callback Invoked on every parse() with ($dom, $jquery).
     */
    public function extend($callback){
        $this->extensions[] = $callback;
    }

    /**
     * Parse a document and run the callback against it.
     *
     * @param string|mixed $html     HTML source; any non-string value is used
     *                               as the DOM as-is.
     * @param callable     $callback Invoked with ($dom, $jquery) when parsing
     *                               succeeded.
     * @return mixed The DOM, or null when parsing failed.
     */
    public function parse($html, $callback)
    {
        $dom = $err = null;
        try {
            if(is_string($html)){
                // To know what to save
                $this->is_fragment = (strpos($html,'<html')===false);
                // Add support for <meta charset="utf-8">
                $html = $this->handleEncoding($html);
                // Parse the document
                //if(strpos($html,'<!DOCTYPE')===false){
                //  $dom = HTML5_Parser::parseFragment($html);
                //}else{
                $dom = HTML5_Parser::parse($html);
                //}
            }else{
                $dom = $html;
            }
        }catch(\Exception $e){
            // Fully qualified on purpose: a bare "Exception" would resolve
            // inside the current namespace and never match.
            $err = $e;
        }
        $this->err = $err;
        $this->dom = $dom;
        $this->jquery = new jQuery($dom, $this->opts);
        foreach($this->extensions as $ext_cb){
            call_user_func($ext_cb, $this->dom, $this->jquery);
        }
        if($err){
            $this->reportError($err);
            return $dom;
        }
        try {
            call_user_func($callback, $dom, $this->jquery);
        }catch(\Exception $e){
            // Keep the failure inspectable even when no handler is registered.
            $this->err = $e;
            $this->reportError($e);
        }
        return $dom;
    }

    // Forward an exception to the registered error handler, if any.
    private function reportError($err)
    {
        if(isset($this->error_handle)){
            call_user_func($this->error_handle, $err);
        }
    }

    /**
     * Normalise the charset declaration of an HTML string.
     *
     * Picks the encoding (option 'encoding' > charset found in the document >
     * option 'default_encoding'), strips the document's own charset meta tag
     * when there is one, and prepends a Content-Type meta tag.
     *
     * @param string $html
     * @return string
     */
    public function handleEncoding($html)
    {
        $encoding = $this->opts['default_encoding'];
        // Get encoding and the position of the tag
        $content_type = self::getContentType($html);
        if($content_type!==false){
            $encoding = $content_type['encoding'];
            // Remove the old encoding tag, which might be valid but still not work for DOMDocument.
            // Only done when a tag was found: with no match there are no offsets to cut at.
            $html = substr_replace($html, '', $content_type['start'], $content_type['length']);
        }
        // Override the encoding if set
        if(!empty($this->opts['encoding'])){
            $encoding = $this->opts['encoding'];
        }
        // Set or re-set the proper encoding
        return self::setEncoding($html, $encoding);
    }

    /**
     * Prepend a Content-Type meta tag declaring $encoding.
     *
     * @param string $html
     * @param string $encoding
     * @return string
     */
    public static function setEncoding($html, $encoding)
    {
        // Remember <meta charset="utf-8"> has no effect on php DOMDocument
        // Remember the <head> tag is optional
        // The first meta has priority, the later ones are ignored
        // Stupid way to inject unicode charset that works and is reliable
        $tag = '<meta http-equiv="Content-Type" content="text/html; charset='.$encoding.'">';
        return $tag.$html;
    }

    /**
     * Locate the first charset-declaring meta tag.
     *
     * @param string $html
     * @return array|false Keys 'start', 'end', 'length' (byte offsets of the
     *                     tag), 'content-type' and 'encoding'; false when no
     *                     tag is found in the inspected prefix.
     */
    public static function getContentType($html)
    {
        //http://stackoverflow.com/questions/4696499/meta-charset-utf-8-vs-meta-http-equiv-content-type
        //http://stackoverflow.com/a/10769573/175071
        // get the first 512 bytes (for performance)
        $block = substr($html, 0, 512);
        if(!preg_match('@<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)[^>]*>\s*@i', $block, $match, PREG_OFFSET_CAPTURE)){
            return false;
        }
        $start = $match[0][1];
        $length = strlen($match[0][0]);
        return array(
            'start'=>$start,
            'end'=>$start + $length,
            'length'=>$length,
            'content-type'=>$match[1][0],
            'encoding'=>$match[2][0],
        );
    }

    /**
     * Serialise a DOMDocument with output formatting enabled.
     *
     * @param \DOMDocument $dom
     * @return string
     */
    public static function formatHtml($dom)
    {
        $dom->formatOutput = true;
        // The property name is case-sensitive ("preserveWhiteSpace").
        $dom->preserveWhiteSpace = false;
        return $dom->saveHTML();
    }

    /**
     * Serialise the parsed document as HTML5.
     *
     * Replaces the injected http-equiv Content-Type meta tag with a
     * <meta charset="..."> tag before saving.
     *
     * @return string
     * @throws \LogicException When called before a successful parse().
     */
    public function html()
    {
        if($this->dom===null || $this->jquery===null){
            throw new \LogicException('HTML5Parser::html() requires a successfully parsed document.');
        }
        // Local copy so the jQuery object can be invoked as a function.
        $q = $this->jquery;
        $meta = $q('meta[http-equiv="Content-Type"]');
        $content_type = self::getContentType((string) $meta->outerHTML());
        // Leave the document alone when no charset declaration is present.
        if($content_type!==false){
            $meta_node = $this->dom->createElement('meta');
            $meta_node->setAttribute('charset', $content_type['encoding']);
            $meta->replaceWith($meta_node);
        }
        return "<!DOCTYPE html>\r\n".$this->dom->saveHTML($this->dom);
    }
    /*
    public static function innerHtml($node){
        $html = '';
        if($node instanceof DOMNode && $node->childNodes instanceof DOMNodeList)
        {
            foreach ($node->childNodes as $child)
            {
                $html .= $child->ownerDocument->saveHTML($child);
            }
        }
        return $html;
    }
    */
    /*
    # http://www.php.net/manual/en/function.chr.php#55978
    # will not convert entities for <> or any of the ASCII chars
    public static function unicode_character_reference_decode($str){
        return preg_replace("/&#(\d{2,5});/e", "self::unichr($1);", $str);
    }
    public static function unichr($dec) {
        if ($dec < 128) {
            $utf = chr($dec);
        } else if ($dec < 2048) {
            $utf = chr(192 + (($dec - ($dec % 64)) / 64));
            $utf .= chr(128 + ($dec % 64));
        } else {
            $utf = chr(224 + (($dec - ($dec % 4096)) / 4096));
            $utf .= chr(128 + ((($dec % 4096) - ($dec % 64)) / 64));
            $utf .= chr(128 + ($dec % 64));
        }
        return $utf;
    }
    */
}