Я получаю некоторые ошибки при попытке отладки следующего кода.
Обратите внимание, что он извлекает данные из примерно 6000 полей из http://europa.eu/youth/volunteering/evs-organisation#open
После анализа каждой страницы проверьте наличие ссылки next ›
внизу.
View-source - это команда на основе браузера. Он говорит браузеру выводить ответ в виде обычного текста, а не отображать его на основе фактического типа содержимого, в данном случае HTML. Вам не нужно включать источник просмотра в свой URL.
Здесь у нас есть скрипт, который извлекает данные из каждого блока и немного их очищает. Функция browse
является общей. Требуется входная ссылка, которая содержит URL и XPath родительского и дочернего элементов, чтобы создать выходной ref. Это просто подход: он еще не перемещается по каждой странице,
В грубом сценарии, который я тестировал, я получил общие результаты, используя //span[@class="ey_badge"]
, затем максимальную страницу, используя
my $page_max = $results / 21;
$page_max = int( $page_max ) == $page_max ? $page_max-- : int( $page_max ) ;
См. Ошибки
martin@linux-3645:~/dev/perl> perl eu.pl
syntax error at eu.pl line 81, near "our "
Global symbol "$iterator_organizations" requires explicit package name at eu.pl line 81.
Can't use global @_ in "my" at eu.pl line 84, near "= @_"
Missing right curly or square bracket at eu.pl line 197, at end of line
Execution of eu.pl aborted due to compilation errors.
martin@linux-3645:~/dev/perl> ^C
martin@linux-3645:~/dev/perl>
Извлекает данные из примерно 6000 полей из http://europa.eu/youth/volunteering/evs-organisation#open
см. Код
use strict;
use warnings FATAL => qw#all#;
use LWP::UserAgent;
use HTML::TreeBuilder::XPath;
use Data::Dumper;
my $handler_relurl = sub { q#https://europa.eu# . $_[0] };
my $handler_trim = sub { $_[0] =~ s#^\s*(.+?)\s*$#$1#r };
my $handler_val = sub { $_[0] =~ s#^[^:]+:\s*##r };
my $handler_split = sub { [ split $_[0], $_[1] ] };
my $handler_split_colon = sub { $handler_split->( qr#; #, $_[0] ) };
my $handler_split_comma = sub { $handler_split->( qr#, #, $_[0] ) };
my $conf = {
url => q#https://europa.eu/youth/volunteering/evs-organisation_en#,
parent => q#//div[@class="vp ey_block block-is-flex"]#,
children => {
internal_url => [ q#//a/@href#, [ $handler_relurl ] ],
external_url => [ q#//i[@class="fa fa-external-link fa-lg"]/parent::p//a/@href#, [ $handler_trim ] ],
title => [ q#//h4# ],
topics => [ q#//div[@class="org_cord"]#, [ $handler_val, $handler_split_colon ] ],
location => [ q#//i[@class="fa fa-location-arrow fa-lg"]/parent::p#, [ $handler_trim ] ],
hand => [ q#//i[@class="fa fa-hand-o-right fa-lg"]/parent::p#, [ $handler_trim, $handler_split_comma ] ],
pic_number => [ q#//p[contains(.,'PIC no')]#, [ $handler_val ] ],
}
};
print Dumper browse( $conf );
sub browse {
my $conf = shift;
my $ref = [ ];
my $lwp_useragent = LWP::UserAgent->new( agent => q#IE 6#, timeout => 10 );
my $response = $lwp_useragent->get( $conf->{url} );
die $response->status_line unless $response->is_success;
my $content = $response->decoded_content;
my $html_treebuilder_xpath = HTML::TreeBuilder::XPath->new_from_content( $content );
my @nodes = $html_treebuilder_xpath->findnodes( $conf->{parent} );
for my $node ( @nodes ) {
push @$ref, { };
while ( my ( $key, $val ) = each %{ $conf->{children} } ) {
my $xpath = $val->[0];
my $handlers = $val->[1] // [ ];
$val = ( $node->findvalues( qq#.$xpath# ) )[0] // next;
$val = $_->( $val ) for @$handlers;
$ref->[-1]->{$key} = $val;
}
}
return $ref;
}
{
'internal_url' => 'https://europa.eu/youth/volunteering/organisation/948417016_en',
'external_url' => 'http://www.apd.ge',
'location' => 'Tbilisi, Georgia',
'title' => '"Academy for Peace and Development" Union',
'topics' => [
'Access for disadvantaged',
'Youth (Participation, Youth Work, Youth Policy)',
'Intercultural/intergenerational education and (lifelong)learning'
],
'pic_number' => '948417016',
'hand' => [
'Receiving',
'Sending'
]
}
our $iterator_organizations = sub {
my ( $browser, $parent ) = @_;
my $url = q#https://europa.eu/youth/volunteering/evs-organisation_en#;
my $nodes = $browser->nodes( url => $url );
my $iterator = sub {
return shift @$nodes;
};
return ( $iterator, 1 );
our $iterator_organizations_b = sub {
my ( $browser, $parent ) = @_;
my $url = q#https://europa.eu/youth/volunteering/evs-organisation_en#;
my $uri = URI->new( $url );
my $xpath = q#//div[@class="vp ey_block block-is-flex"]#;
my $nodes = [ ];
my $page = 0;
my $results = $parent->{results};
my $page_max = $results / 21;
$page_max = int($page_max) == $page_max ? $page_max-- : int($page_max);
my $iterator_uri = sub {
$uri->query_form( page => $page++ );
return $page > 2 ? undef : $uri ; # $page_max;
};
my $iterator_node = sub {
unless ( @$nodes ) {
my $uri = $iterator_uri->( ) // return undef;
my $options = $page == 1 ? { tree => $parent->{_node} } : { url => $uri->as_string };
$nodes = $browser->nodes( %$options, xpath => $xpath );
}
return shift @$nodes;
};
return ( $iterator_node, 0 );
};
our $iterator_organization = sub {
my ( $browser, $parent ) = @_;
my $url = $parent->{internal_url};
my $nodes = $browser->nodes( url => $url );
my $iterator = sub {
return shift @$nodes;
};
return ( $iterator, 1 );
};
sub organizations {
my ( $self, $options ) = ( shift, { @_ } );
my $map = [
$Massweb::Browser::Europa::iterator_organizations,
results => q#.//span[@class="ey_badge"]#,
organizations => [
$Massweb::Browser::Europa::iterator_organizations_b,
internal_url => [ q#.//a/@href#, $Massweb::Browser::Europa::handler_url ],
external_url => [ q#.//i[@class="fa fa-external-link fa-lg"]/parent::p//a/@href#, $Massweb::Browser::handler_trim ],
title => q#.//h4#,
topics => [ q#.//div[@class="org_cord"]#, $Massweb::Browser::handler_val, $Massweb::Browser::handler_list_colon ],
location => [ q#.//i[@class="fa fa-location-arrow fa-lg"]/parent::p#, $Massweb::Browser::handler_trim ],
hand => [ q#.//i[@class="fa fa-hand-o-right fa-lg"]/parent::p#, $Massweb::Browser::handler_trim, $Massweb::Browser::handler_list_comma ],
pic_number => [ q#.//p[contains(.,'PIC no')]#, $Massweb::Browser::handler_val ],
recruiting => [ q#boolean(.//i[@class="fa fa-user-times fa-lg"])#, $Massweb::Browser::handler_bool_rev ],
_ => \&organization,
],
];
my $organizations = $self->browse( map => $map );
return $organizations;
}
sub organization {
my ( $self, $options ) = ( shift, { @_ } );
my $map = [
sub { $Massweb::Browser::Europa::iterator_organization->( $_[0], $options ) },
#title => q#.//h1#,
description => q#.//div[@class="ey_vp_detail_page"]/p#,
];
my $organization = $self->browse( map => $map );
return $organization;
}