Я чаще всего использую PHP и MySQL с CURL
http://en.wikipedia.org/wiki/CURL
Вы можете делать разные интересные вещи, например отправлять запросы к страницам результатов поиска и т. д. Ниже — исходный код краулера, который я использую. Я вырезал некоторые части ради анонимности, но это хороший, почти рабочий пример. Я могу помочь вам запустить его, если потребуется.
<?php
/**
 * Small page scraper with MySQL persistence helpers.
 *
 * Extractors are protected methods named "_get_<type>" and are reached through
 * get('<type>'). The extractors read $this->markup; nothing in this class
 * assigns it, so the owner of the instance (or a subclass) must populate it.
 */
class Crawler {
    /** @var string HTML that the "_get_*" extractors search. */
    protected $markup = '';
    /** @var string Location handed to file_get_contents() by getMarkup(). */
    protected $uri = '';
    /** @var string Database host. */
    protected $db_location = "localhost";
    /** @var string Database user. */
    protected $db_username = "***";
    /** @var string Database password. */
    protected $db_password = "***";
    /** @var string Database (schema) name. */
    protected $db_name = "***";
    /** @var mysqli|null Connection opened on first use and shared by db_query()/db_insert(). */
    protected $connection = null;

    /**
     * Removes PHP's memory limit for the current request ("-1" means unlimited).
     */
    public function __construct() {
        ini_set('memory_limit', '-1');
    }

    /**
     * Downloads the document at $this->uri.
     *
     * The result is returned only; it is not stored in $this->markup.
     *
     * @return string|false The document body, or false when it cannot be read.
     */
    public function getMarkup() {
        // On PHP 8 an empty path makes file_get_contents() throw a ValueError,
        // which "@" cannot silence. Report it as the same soft failure (false)
        // that older PHP versions produced.
        if ($this->uri === null || $this->uri === '') {
            return false;
        }
        // "@" keeps the original best-effort contract: failure is signalled by false.
        return @file_get_contents($this->uri);
    }

    /**
     * Runs the extractor named "_get_<type>" and returns its result.
     *
     * @param string $type Extractor suffix, e.g. "links" for _get_links().
     * @return mixed The extractor's return value, or null when no such method exists.
     */
    public function get($type) {
        $method = "_get_{$type}";
        if (method_exists($this, $method)) {
            // call_user_method() was removed in PHP 7; a variable method call
            // is the equivalent that works on every PHP version.
            return $this->$method();
        }
        return null;
    }

    /**
     * Returns the shared mysqli connection, opening it on first use.
     *
     * @return mysqli
     * @throws RuntimeException When the connection cannot be established.
     */
    protected function db_connection() {
        if ($this->connection === null) {
            $connection = new mysqli(
                $this->db_location,
                $this->db_username,
                $this->db_password,
                $this->db_name
            );
            if ($connection->connect_errno) {
                throw new RuntimeException($connection->connect_error);
            }
            $this->connection = $connection;
        }
        return $this->connection;
    }

    /**
     * Executes a raw SQL statement.
     *
     * The statement is sent as-is, so callers must never build it from
     * untrusted input without escaping.
     *
     * @param string $query SQL text.
     * @return array|null For statements that produce a result set: a list of rows,
     *                    each indexed both by column number and column name (empty
     *                    list when there are no rows). Null for other statements.
     * @throws RuntimeException When the statement fails ("<error> >> <query>").
     */
    protected function db_query($query) {
        $connection = $this->db_connection();
        //echo $query."<br/>"; //for debugging
        $result = $connection->query($query);
        if ($result === false) {
            throw new RuntimeException($connection->error." >> ".$query);
        }
        if (!($result instanceof mysqli_result)) {
            // INSERT/UPDATE/etc. succeed with boolean true and carry no rows.
            return null;
        }

        $rows = array();
        while ($row = $result->fetch_array(MYSQLI_BOTH)) {
            $clean = array();
            foreach ($row as $key => $value) {
                // stripslashes() is kept from the original read path; the cast
                // turns SQL NULL into '' without a deprecation notice on PHP 8.1+.
                $clean[$key] = stripslashes((string) $value);
            }
            $rows[] = $clean;
        }
        $result->free();
        return $rows;
    }

    /**
     * Inserts one row, filling every column of $table in declaration order.
     *
     * Columns missing from $array are written as the empty string.
     *
     * @param string $table Table name (interpolated verbatim; must be trusted).
     * @param array  $array Map of column name => value.
     * @return int|string Auto-increment id reported by the connection after the insert.
     * @throws RuntimeException When either statement fails.
     */
    protected function db_insert($table, $array) {
        $columns = $this->db_query("show columns from ".$table);
        $connection = $this->db_connection();

        $values = array();
        foreach ($columns as $column) {
            // Index 0 of a "show columns" row is the column name.
            $name = $column[0];
            if ($name && array_key_exists($name, $array)) {
                // Charset-aware escaping instead of addslashes().
                $values[] = "'".$connection->real_escape_string((string) $array[$name])."'";
            } else {
                $values[] = "''";
            }
        }

        $this->db_query("insert into $table values(".implode(", ", $values).")");
        return $connection->insert_id;
    }

    /**
     * Collects the name, tags, stat keys and stat values extractors into one array.
     *
     * Stat values containing any of the boilerplate markers are dropped, the
     * rest are trimmed and re-indexed. Tags, when present as an array, are
     * replaced by a map of tag => (number of occurrences - 1).
     *
     * @return array Keys: 'name', 'tags', 'stat_keys', 'stat_values'.
     */
    protected function _get_data() {
        $scrape = array();
        //$scrape['id'] = $this->get('id');
        $scrape['name'] = $this->get('name');
        $scrape['tags'] = $this->get('tags');
        $scrape['stat_keys'] = $this->get('stat_keys');
        $scrape['stat_values'] = $this->get('stat_values');

        // Extractors return null/FALSE when nothing was found; treat that as "no values"
        // instead of iterating a non-array.
        $stat_values = is_array($scrape['stat_values']) ? $scrape['stat_values'] : array();
        $markers = array(
            "<h5>Featured Product</h5>",
            "<h5>Featured Company</h5>",
            "<h5>Featured Type</h5>",
            "sign in",
            "/100",
        );
        $kept = array();
        foreach ($stat_values as $key => $value) {
            $is_boilerplate = false;
            foreach ($markers as $marker) {
                // "!== false" so that a marker at offset 0 also counts as a hit.
                if (strpos($value, $marker) !== false) {
                    $is_boilerplate = true;
                    break;
                }
            }
            if (!$is_boilerplate) {
                $kept[$key] = trim($value);
            }
        }

        // Type check first: counting a non-array is a TypeError on PHP 8.
        if (is_array($scrape['tags']) && count($scrape['tags']) > 0) {
            $tag_counts = array();
            foreach ($scrape['tags'] as $tag) {
                // First sighting stores 0, each repeat adds 1 (occurrences - 1).
                $tag_counts[$tag] = isset($tag_counts[$tag]) ? $tag_counts[$tag] + 1 : 0;
            }
            $scrape['tags'] = $tag_counts;
        }

        // array_merge() renumbers the integer keys left sparse by the filtering.
        $scrape['stat_values'] = array_merge(array(), $kept);
        return $scrape;
    }

    /**
     * Applies $pattern to the markup and returns everything captured by group 1.
     *
     * @param string $pattern PCRE pattern with at least one capture group.
     * @return array|false|null Captures of group 1, FALSE when there are none,
     *                          null when the markup is empty.
     */
    protected function match_first_group($pattern) {
        if (empty($this->markup)) {
            return null;
        }
        preg_match_all($pattern, $this->markup, $matches);
        return !empty($matches[1]) ? $matches[1] : FALSE;
    }

    /**
     * Attribute strings of self-closed <img ... /> tags.
     *
     * @return array|false|null See match_first_group().
     */
    protected function _get_images() {
        return $this->match_first_group('/<img([^>]+)\/>/i');
    }

    /**
     * Attribute strings of <a ...>...</a> elements.
     *
     * @return array|false|null See match_first_group().
     */
    protected function _get_links() {
        return $this->match_first_group('/<a([^>]+)\>(.*?)\<\/a\>/i');
    }

    /**
     * Text between "/wine/view/" and the next "-" for every such occurrence.
     *
     * @return array|false|null See match_first_group().
     */
    protected function _get_id() {
        return $this->match_first_group('/\/wine\/view\/([^`]*?)-/');
    }

    /**
     * Inner text of anchors carrying class="linked" style="font-size: 14px;".
     *
     * @return array|false|null See match_first_group().
     */
    protected function _get_grape() {
        return $this->match_first_group('/ class="linked" style="font-size: 14px;">([^`]*?)<\/a>/');
    }
}
// Start a crawl only when the request carries ?pass=go.
// isset() prevents an "undefined index" warning when the parameter is absent,
// and the strict comparison rules out loose type juggling.
if (isset($_GET['pass']) && $_GET['pass'] === "go") {
    $crawl = new Crawler();
    // NOTE(review): go() is not among the Crawler methods visible in this file;
    // it must be supplied before this call can succeed.
    $crawl->go();
}
?>