-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathuntitled527408185540810220.php
More file actions
107 lines (82 loc) · 2.82 KB
/
untitled527408185540810220.php
File metadata and controls
107 lines (82 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
<?php
// Third-party Simple HTML DOM parser; the file_get_html() calls further down rely on it.
include 'simple_html_dom.php';
// NOTE(review): phpinfo() prints the full PHP configuration on every run — looks like leftover debugging; confirm before removing.
phpinfo();
// Scraper for blenderguru: gathers .blend files, videos and their descriptions into a folder of tutorials.
// Intended flow: walk each page of videos, save the page title, extract the description text,
// then download the associated .blend file and the video.
$url = "http://www.blenderguru.com/video-category/tutorials/"; //url to begin scrape
$pagestart=1; //page number to start scrape (not read anywhere in the visible code)
$maxpages=15; //presumably the upper bound on pages to visit — TODO confirm; not read anywhere in the visible code
/*
$doc=new DOMDocument();
@$doc->loadHTMLFile($url);
$found=false;
var_dump($doc);
$tutorials=$doc->getElementsByTagName("post ");
var_dump($tutorials);
foreach($tutorials as $tutorial){
echo $tutorial->getAttribute('href').'|'.$tutorial->nodeValue."\n";
}
*/
/**
 * Print every <div> on a page that matches the selector div[class=$divclass].
 *
 * Loads $url through Simple HTML DOM, echoes the whole parsed document
 * (debug output), then echoes each matching element followed by the
 * literal text "/br".
 *
 * @param string $url      Address of the page to load.
 * @param string $divclass Class value inserted into the div[class=...] selector.
 * @return void Nothing is returned; all results are written to the output.
 */
function find_posts_on_page($url,$divclass)
{
    $html=file_get_html($url);
    // file_get_html() hands back false when the page is empty, too large or
    // unreachable; calling find() on that would be a fatal error.
    if(!is_object($html)){
        trigger_error('find_posts_on_page: could not load '.$url, E_USER_WARNING);
        return;
    }
    echo $html;
    echo 'getting html document'."\n";
    foreach($html->find('div[class='.$divclass.']') as $element){
        echo 'finding post elements'."\n\n\n";
        // NOTE(review): "/br" is printed verbatim; it may have been meant as a line break — confirm.
        echo $element."/br";
    }
    // Release the parsed tree; the parser's manual recommends clear() to avoid
    // leaking memory when many pages are loaded in one run.
    $html->clear();
}
/**
 * Find the link to the next page of a paginated listing.
 *
 * Loads $url, looks up every element matching the '.next' selector, echoes
 * debug information for each one and returns the href of the first match
 * that actually carries a non-empty href attribute.
 *
 * @param string $url           Address of the current listing page.
 * @param string $nextpageclass Not used: the selector is fixed to '.next'.
 *                              The parameter is kept so existing calls keep working.
 * @return string|null The href of the next-page link, or null when the page
 *                     could not be loaded or no '.next' element has an href.
 */
function find_next_paginated_page($url,$nextpageclass){
    $html=file_get_html($url);
    // file_get_html() returns false on a failed load; bail out instead of
    // calling find() on a non-object.
    if(!is_object($html)){
        trigger_error('find_next_paginated_page: could not load '.$url, E_USER_WARNING);
        return null;
    }
    echo 'loading page'."\n";
    $linkto=null;
    // find() returns an array of nodes, so the href has to be read from an
    // individual element, never from the array itself.
    foreach($html->find('.next') as $nxtpg){
        echo 'link info'."\n".$nxtpg."\n";
        $linkhref=$nxtpg->href;
        echo $linkhref;
        // Keep the first usable href; a node without the attribute does not yield a string.
        if($linkto===null && is_string($linkhref) && $linkhref!==''){
            $linkto=$linkhref;
        }
    }
    // The href was copied out as a plain string, so the tree can be released now.
    $html->clear();
    return $linkto;
}
$vidurl='http://www.blenderguru.com/videos/introduction-to-rigging/';
// Find the video on a tutorial page: look inside each div with class
// "single-video", take the src of its first child, fetch that address and
// read the "url" field of the JSON it returns.
// (Original author's open question: would a regex over the plain text be a better approach?)
$html=file_get_html($vidurl);
echo 'loading html file';
//echo $html;
if(is_object($html)){
    $videolink=$html->find('div[class=single-video]');
}else{
    // file_get_html() returns false on a failed load; use an empty result so
    // the loop below is skipped instead of crashing on a non-object.
    trigger_error('could not load '.$vidurl, E_USER_WARNING);
    $videolink=array();
}
foreach($videolink as $videos){
    echo 'finding videos'."\n";
    echo $videos."\n";
    echo 'finding vidlink'."\n";
    // children(0) yields no node when the container is empty, and a node
    // without a src attribute does not yield a string.
    $vidnode=$videos->children(0);
    $vidlink=is_object($vidnode) ? $vidnode->src : false;
    if(!is_string($vidlink) || $vidlink===''){
        trigger_error('single-video container has no child with a src attribute', E_USER_WARNING);
        continue;
    }
    echo $vidlink."\n";
    $vidrequest=file_get_contents($vidlink);
    if($vidrequest===false){
        trigger_error('could not fetch '.$vidlink, E_USER_WARNING);
        continue;
    }
    echo $vidrequest;
    $vid_json_decode=json_decode($vidrequest);
    var_dump($vid_json_decode);
    // json_decode() returns null for anything that is not valid JSON (for
    // example an HTML player page), so check before reading the property.
    if(!is_object($vid_json_decode) || !isset($vid_json_decode->url)){
        trigger_error('response from '.$vidlink.' has no "url" field', E_USER_WARNING);
        continue;
    }
    $vid_download_url=$vid_json_decode->{'url'};
    echo $vid_download_url;
}
// Self-test for the regex meant to pull a Vimeo mp4 address out of raw page text.
echo 'matching string'."\n";
// '#' is the pattern delimiter. Without real delimiters PCRE treated the
// leading '"' as the delimiter and rejected the rest as unknown modifiers.
// The path is digits and slashes, the dots are escaped, and the query string
// is limited to letters, digits, '=', '_' and '&'. No closing quote is
// required: the class cannot consume a quote, so the match ends at it anyway.
$match_test_pattern='#"url":"http://av\.vimeo\.com/[0-9/]+\.mp4\?[A-Za-z0-9=_&]*#';
$match_test_string='"url":"http://av.vimeo.com/50720/122/67723999.mp4?token2=1396572825_d2b6874213a2a3640722ee62b766e869&aksessionid=5ec2044a776e71fd&ns=4';
// preg_match() gives 1 on a match, 0 on no match and false on a pattern error.
$find_vid_url=preg_match($match_test_pattern, $match_test_string);
echo $find_vid_url;
/*$html=file_get_html($url);
echo 'loading page'."\n";
$nextpage=$html->find('.next');
foreach($nextpage as $nxtpg){
echo 'link info'."\n".$nxtpg."\n";
$linkhref=$nxtpg->href;
echo $linkhref;
}
$linkto=$nextpage->href;
echo $linkto;
*
*
*/
// Marker printed as the last statement of the script.
echo ' end'."\n";
?>