Old notes on screen scraping that formed the basis of the Simple scraper module.
screenscraping, xml-rpc, drupal screen scraping
XML-RPC
http://blog.riff.org/2005_12_17_drupal_coder_writing_a_drupal_xml_rpc_service
(This guy's using Drupal 5! [before the release candidate, mind you])
HTML Import
http://drupal.org/project/import_html?destination=node%2F46008
http://openconcept.ca/screenscraping_a_la_php
Used:
http://interoperating.info/mark/node/61
Going forward: XML-RPC ?
http://www.xmlrpc.com/directory/1568/implementations
<?php
// Fetch one page from the remote site and return only the HTML found between
// its CONTENT_DISPLAY_BEGIN / CONTENT_DISPLAY_END comment markers, with image
// and link paths pointed back at the original site.
$domain = 'http://hera.pioneersg.com/';
$path = '';
$page = 'caddetails.asp?pindex=155';
$url = $domain . $path . $page;

// Read the page as plain data. include_once on a URL would execute any PHP
// the remote server sent back, and only works with allow_url_include enabled.
$output = file_get_contents($url);
if ($output === false) {
  // Nothing could be fetched, so there is nothing to display.
  return '';
}

// remove unwanted opening HTML: everything up to and including the last
// BEGIN marker (the old greedy '^.*' pattern also cut at the last one)
$begin_marker = '<!--CONTENT_DISPLAY_BEGIN-->';
$begin_at = strrpos($output, $begin_marker);
if ($begin_at !== false) {
  $output = substr($output, $begin_at + strlen($begin_marker));
}

// remove unwanted closing HTML: the first END marker and everything after it
$end_marker = '<!--CONTENT_DISPLAY_END-->';
$end_at = strpos($output, $end_marker);
if ($end_at !== false) {
  $output = substr($output, 0, $end_at);
}

// point image paths to original
// NOTE(review): this prefixes every src, so it assumes the page only uses
// relative image paths - confirm against the remote markup.
$output = str_replace('<img src="', '<img src="' . $domain . $path, $output);
// point URLs to original - unfortunately we'd rather not have links to the original
$output = str_replace('<a href="', '<a href="' . $domain . $path, $output);

return $output;
?>
<?php
// Fetch a drupal.org user "track" page and return only the HTML found between
// its "begin content" / "end content" comment markers, with /node/ and /user/
// links pointed back at drupal.org.
$url = 'http://drupal.org/user/25887/track';

// Read the page as plain data. include_once on a URL would execute any PHP
// the remote server sent back, and only works with allow_url_include enabled.
$output = file_get_contents($url);
if ($output === false) {
  // Nothing could be fetched, so there is nothing to display.
  return '';
}

// Get rid of everything in the incoming data before '<!-- begin content -->'.
// The old pattern '^.<!-- begin content -->' was missing its '*', so it only
// matched when exactly one character preceded the marker.
$begin_marker = '<!-- begin content -->';
$begin_at = strrpos($output, $begin_marker);
if ($begin_at !== false) {
  $output = substr($output, $begin_at + strlen($begin_marker));
}

// Fix the relative URLs in the incoming data so they point back to drupal.org
// NOTE(review): this rewrites every '/node/' and '/user/' substring, so it
// assumes the page contains no absolute links to those paths - confirm.
$output = str_replace('/node/', 'http://drupal.org/node/', $output);
$output = str_replace('/user/', 'http://drupal.org/user/', $output);

// Get rid of everything in the incoming data after '<!-- end content -->'
// (the old pattern '<!-- end content -->.$' had the same missing '*').
$end_marker = '<!-- end content -->';
$end_at = strpos($output, $end_marker);
if ($end_at !== false) {
  $output = substr($output, 0, $end_at);
}

return $output;
?>
screenscraping, xml-rpc, drupal screen scraping
XML-RPC
http://blog.riff.org/2005_12_17_drupal_coder_writing_a_drupal_xml_rpc_service
(This guy's using Drupal 5! [before the release candidate, mind you])
HTML Import
http://drupal.org/project/import_html?destination=node%2F46008
http://openconcept.ca/screenscraping_a_la_php
Used:
http://interoperating.info/mark/node/61
Going forward: XML-RPC ?
http://www.xmlrpc.com/directory/1568/implementations
<?php
// Fetch one page from the remote site and return only the HTML found between
// its CONTENT_DISPLAY_BEGIN / CONTENT_DISPLAY_END comment markers, with image
// and link paths pointed back at the original site.
$domain = 'http://hera.pioneersg.com/';
$path = '';
$page = 'caddetails.asp?pindex=155';
$url = $domain . $path . $page;

// Read the page as plain data. include_once on a URL would execute any PHP
// the remote server sent back, and only works with allow_url_include enabled.
$output = file_get_contents($url);
if ($output === false) {
  // Nothing could be fetched, so there is nothing to display.
  return '';
}

// remove unwanted opening HTML: everything up to and including the last
// BEGIN marker (the old greedy '^.*' pattern also cut at the last one)
$begin_marker = '<!--CONTENT_DISPLAY_BEGIN-->';
$begin_at = strrpos($output, $begin_marker);
if ($begin_at !== false) {
  $output = substr($output, $begin_at + strlen($begin_marker));
}

// remove unwanted closing HTML: the first END marker and everything after it
$end_marker = '<!--CONTENT_DISPLAY_END-->';
$end_at = strpos($output, $end_marker);
if ($end_at !== false) {
  $output = substr($output, 0, $end_at);
}

// point image paths to original
// NOTE(review): this prefixes every src, so it assumes the page only uses
// relative image paths - confirm against the remote markup.
$output = str_replace('<img src="', '<img src="' . $domain . $path, $output);
// point URLs to original - unfortunately we'd rather not have links to the original
$output = str_replace('<a href="', '<a href="' . $domain . $path, $output);

return $output;
?>
<?php
// Fetch a drupal.org user "track" page and return only the HTML found between
// its "begin content" / "end content" comment markers, with /node/ and /user/
// links pointed back at drupal.org.
$url = 'http://drupal.org/user/25887/track';

// Read the page as plain data. include_once on a URL would execute any PHP
// the remote server sent back, and only works with allow_url_include enabled.
$output = file_get_contents($url);
if ($output === false) {
  // Nothing could be fetched, so there is nothing to display.
  return '';
}

// Get rid of everything in the incoming data before '<!-- begin content -->'.
// The old pattern '^.<!-- begin content -->' was missing its '*', so it only
// matched when exactly one character preceded the marker.
$begin_marker = '<!-- begin content -->';
$begin_at = strrpos($output, $begin_marker);
if ($begin_at !== false) {
  $output = substr($output, $begin_at + strlen($begin_marker));
}

// Fix the relative URLs in the incoming data so they point back to drupal.org
// NOTE(review): this rewrites every '/node/' and '/user/' substring, so it
// assumes the page contains no absolute links to those paths - confirm.
$output = str_replace('/node/', 'http://drupal.org/node/', $output);
$output = str_replace('/user/', 'http://drupal.org/user/', $output);

// Get rid of everything in the incoming data after '<!-- end content -->'
// (the old pattern '<!-- end content -->.$' had the same missing '*').
$end_marker = '<!-- end content -->';
$end_at = strpos($output, $end_marker);
if ($end_at !== false) {
  $output = substr($output, 0, $end_at);
}

return $output;
?>
Comments
Post new comment