Here's a simple PHP class I wrote to crawl a URL and return a list of internal and external URLs. I've used it in the past, for development purposes only, to find 404s and repetition in URL structure. Note that it does not read robots.txt files or obey any similar rules. I just thought I'd pull it out of the archives and share it on the web.
#!/usr/bin/php
Here is the class:
#!/usr/bin/php
<?php
/**
 * Minimal same-site link crawler.
 *
 * Constructing the object performs the whole crawl: starting from the given
 * URL it downloads each page, extracts the href of every <a> tag, follows the
 * links that belong to the same base domain and records the ones that do not.
 * Call output_all_urls() afterwards to print both lists.
 *
 * Known limitations (unchanged from the original design):
 *  - robots.txt and similar rules are not read or obeyed.
 *  - The "base domain" is simply the last two labels of the host name, so
 *    hosts under multi-label suffixes such as "co.uk" all share one base.
 *  - Page-relative links ("foo.html", "../foo") are resolved against the site
 *    root, not against the directory of the page they were found on.
 */
class Crawl {

    /** @var string PCRE pattern whose first group captures an <a> tag's href value. */
    protected $regex_link;

    /** @var string|null Start URL exactly as passed to the constructor. */
    protected $website_url;

    /** @var string|null Base domain of the start URL, as returned by get_url_base(). */
    protected $website_url_base;

    /** @var string|null "scheme://host[:port]" of the start URL, used to resolve relative links. */
    protected $website_url_origin;

    /** @var array List of URLs for which a fetch has been attempted. */
    protected $urls_processed;

    /** @var array List of discovered URLs whose base domain differs from the start URL's. */
    protected $urls_external;

    /** @var array Queue (FIFO) of internal URLs still to be fetched. */
    protected $urls_not_processed;

    /** @var array Substrings; any link containing one (case-insensitively) is dropped. */
    protected $urls_ignored;

    /**
     * Set up the crawler and, if the start URL is usable, crawl immediately.
     *
     * When the argument is not an http:// or https:// string, or no host can
     * be extracted from it, the object is left with empty result lists.
     *
     * @param string|null $website_url Absolute http(s) URL to start from.
     */
    public function __construct($website_url = NULL) {
        // setup variables
        $this->regex_link = "/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]/isU";
        $this->urls_processed = array();
        $this->urls_external = array();
        $this->urls_not_processed = array();
        $this->urls_ignored = array(
            '/search/apachesolr_search/',
            '/comment/reply/',
        );

        // A constructor cannot hand a value back to "new", so just stop here.
        if (!$this->validate_arg_website_url($website_url)) {
            return;
        }
        $this->website_url = $website_url;

        $url_base = $this->get_url_base($this->website_url);
        if (!$url_base) {
            return;
        }
        $this->website_url_base = $url_base;

        $url_origin = $this->get_url_origin($this->website_url);
        if ($url_origin) {
            $this->website_url_origin = $url_origin;
        }

        // Seed the queue and drain it; processing a page may enqueue more URLs.
        $this->urls_not_processed[] = $this->website_url;
        while (count($this->urls_not_processed)) {
            $this->process_urls_not_processed();
        }

        // sort data
        sort($this->urls_processed);
        sort($this->urls_external);
    }

    /**
     * Check that the start URL is a string beginning with http:// or https://.
     *
     * @param mixed $website_url Value to check.
     * @return bool TRUE when acceptable, FALSE otherwise.
     */
    protected function validate_arg_website_url($website_url = NULL) {
        if (!is_string($website_url)) {
            return FALSE;
        }
        return substr($website_url, 0, 7) === 'http://'
            || substr($website_url, 0, 8) === 'https://';
    }

    /**
     * Reduce a URL to the base domain used for the internal/external decision.
     *
     * The host is lower-cased. IP addresses and single-label hosts (for
     * example "localhost") are returned whole; otherwise the last two labels
     * are returned ("www.example.com" gives "example.com").
     *
     * @param string|null $url Absolute URL.
     * @return string|false Base domain, or FALSE when the URL has no host.
     */
    protected function get_url_base($url = NULL) {
        if (!is_string($url) || $url === '') {
            return FALSE;
        }
        $url_parts = parse_url($url);
        // parse_url() omits 'host' for inputs such as "mailto:x" or "/path".
        if (!is_array($url_parts) || !isset($url_parts['host']) || $url_parts['host'] === '') {
            return FALSE;
        }
        $host = rtrim(strtolower($url_parts['host']), '.');
        if ($host === '') {
            return FALSE;
        }
        // Taking "the last two labels" of an IP address would be meaningless.
        if (filter_var($host, FILTER_VALIDATE_IP) !== FALSE) {
            return $host;
        }
        $labels = explode('.', $host);
        $label_count = count($labels);
        if ($label_count < 2) {
            return $host;
        }
        return $labels[$label_count - 2] . '.' . $labels[$label_count - 1];
    }

    /**
     * Build "scheme://host[:port]" from an absolute URL.
     *
     * @param string|null $url Absolute URL.
     * @return string|false Origin string, or FALSE when scheme or host is missing.
     */
    protected function get_url_origin($url = NULL) {
        if (!is_string($url) || $url === '') {
            return FALSE;
        }
        $url_parts = parse_url($url);
        if (!is_array($url_parts) || empty($url_parts['scheme']) || empty($url_parts['host'])) {
            return FALSE;
        }
        $origin = strtolower($url_parts['scheme']) . '://' . strtolower($url_parts['host']);
        if (isset($url_parts['port'])) {
            $origin .= ':' . $url_parts['port'];
        }
        return $origin;
    }

    /**
     * Download the body of a page.
     *
     * Kept as a separate method so that subclasses can substitute another
     * transport. Uses file_get_contents(), which returns FALSE on failure.
     *
     * @param string $url URL to fetch.
     * @return string|false Page body, or FALSE on failure.
     */
    protected function fetch_page($url) {
        return file_get_contents($url);
    }

    /**
     * Extract the distinct href values of all <a> tags in a chunk of HTML.
     *
     * Attribute values are HTML-escaped in the markup, so entities such as
     * "&amp;" are decoded before the values are returned.
     *
     * @param string $page_contents HTML source.
     * @return array|false Unique href values (possibly empty), or FALSE if the regex failed.
     */
    protected function extract_links($page_contents) {
        $matches = array();
        $found = preg_match_all($this->regex_link, $page_contents, $matches);
        if ($found === FALSE || !isset($matches[1])) {
            return FALSE;
        }
        $links = array();
        foreach ($matches[1] as $href) {
            $links[] = html_entity_decode($href, ENT_QUOTES, 'UTF-8');
        }
        return array_unique($links);
    }

    /**
     * Fetch one URL and return the links found on it.
     *
     * The URL is appended to the processed list before the download is
     * attempted, so URLs that fail to load are still listed and never retried.
     *
     * @param string $url URL to fetch.
     * @return array|false Unique href values, or FALSE when the URL is invalid,
     *                     was already processed, or could not be loaded.
     */
    protected function scan_url($url) {
        // validate url
        if (!is_string($url) || !$url || !strlen($url)) {
            return FALSE;
        }
        // ensure url has not already been processed
        if (in_array($url, $this->urls_processed, TRUE)) {
            return FALSE;
        }
        $this->urls_processed[] = $url;

        $page_contents = $this->fetch_page($url);
        if (!is_string($page_contents) || $page_contents === '') {
            return FALSE;
        }
        return $this->extract_links($page_contents);
    }

    /**
     * Classify raw href values as ignored, external, or internal-to-crawl.
     *
     * @param array|null $matches Raw href values as returned by scan_url().
     * @return bool FALSE when there was nothing to process, TRUE otherwise.
     */
    protected function process_matches($matches = NULL) {
        if (!is_array($matches) || empty($matches)) {
            return FALSE;
        }

        // Fall back to the historical "http://<base>" root if no origin was recorded.
        $origin = $this->website_url_origin
            ? $this->website_url_origin
            : 'http://' . $this->website_url_base;
        $origin_scheme = substr($origin, 0, strpos($origin, '://'));

        foreach ($matches as $match) {
            // ignore empty values and same-page anchors
            if (!is_string($match) || $match === '' || $match[0] === '#') {
                continue;
            }

            $scheme_parts = array();
            if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $match, $scheme_parts)) {
                // Explicit scheme: only http(s) can be crawled. This drops
                // javascript:, mailto:, tel:, ftp:, data: and the like.
                $scheme = strtolower($scheme_parts[1]);
                if ($scheme !== 'http' && $scheme !== 'https') {
                    continue;
                }
            }
            elseif (substr($match, 0, 2) === '//') {
                // protocol-relative link: inherits the scheme of the crawled site
                $match = $origin_scheme . ':' . $match;
            }
            elseif ($match[0] === '/') {
                // root-relative link
                $match = $origin . $match;
            }
            else {
                // NOTE: page-relative links ('foo', '../foo') are still resolved
                // against the site root rather than the current page.
                $match = $origin . '/' . $match;
            }

            // remove trailing slash so "/about" and "/about/" count as one URL
            if (substr($match, -1) === '/') {
                $match = substr($match, 0, -1);
            }

            // check if url is to be ignored
            foreach ($this->urls_ignored as $ignored) {
                if (stripos($match, $ignored) !== FALSE) {
                    continue 2;
                }
            }

            // URLs without a usable host cannot be classified or fetched.
            $url_base = $this->get_url_base($match);
            if ($url_base === FALSE) {
                continue;
            }

            // check for external url
            if ($url_base !== $this->website_url_base) {
                if (!in_array($match, $this->urls_external, TRUE)) {
                    $this->urls_external[] = $match;
                }
                continue;
            }

            // queue internal urls that are neither processed nor already queued
            if (in_array($match, $this->urls_processed, TRUE)) {
                continue;
            }
            if (!in_array($match, $this->urls_not_processed, TRUE)) {
                $this->urls_not_processed[] = $match;
            }
        }
        return TRUE;
    }

    /**
     * Take the next URL off the queue, fetch it and process its links.
     *
     * @return bool FALSE when the queue is empty or the page yielded no links,
     *              otherwise the result of process_matches().
     */
    protected function process_urls_not_processed() {
        if (empty($this->urls_not_processed)) {
            return FALSE;
        }
        // get unprocessed url
        $url = array_shift($this->urls_not_processed);
        // scan url
        $matches = $this->scan_url($url);
        // error check
        if (!$matches || !is_array($matches) || empty($matches)) {
            return FALSE;
        }
        return $this->process_matches($matches);
    }

    /**
     * Print the processed (internal) URLs followed by the external URLs,
     * one per line, each list preceded by a heading line.
     */
    public function output_all_urls() {
        echo "===== INTERNAL URLS =====\n";
        foreach ($this->urls_processed as $url) {
            echo $url . "\n";
        }
        echo "===== EXTERNAL URLS =====\n";
        foreach ($this->urls_external as $url) {
            echo $url . "\n";
        }
    }
}
?>
It can be used like this:
<?php
// Constructing the crawler runs the whole crawl from the start URL.
$crawler = new Crawl('http://www.example.com');
// Print the internal URLs first, then the external ones.
$crawler->output_all_urls();
?>
No comments:
Post a Comment
Thanks for your comment.