получить самые большие изображения - PullRequest
0 голосов
/ 13 апреля 2011

Я делаю проект поиска изображений и хочу получить самое большое изображение на странице. Я написал код, который приводит адрес изображения к абсолютному и отбрасывает изображения, которые могут быть рекламой. Оставшиеся изображения сравниваются по площади (ширина * высота), чтобы выбрать самое большое. Но в моём коде есть проблемы. Ниже весь мой код. Может кто-нибудь помочь найти, где ошибка, и подсказать, как оптимизировать код? Мне кажется, он работает очень медленно. Спасибо всем.

<?php
// Asker's original script: walks every <img> on one page and tries to report
// the image with the largest area (width * height), skipping probable ads.
// NOTE(review): kept exactly as posted; known problems are flagged inline.
require_once 'simple_html_dom.php'; // presumably provides file_get_html() -- not shown in this listing
require 'url_to_absolute.php'; // provides url_to_absolute() for resolving image URLs
$v = 'http://www.yomiuri.co.jp/stream/';
$html = file_get_html($v);
$maxsize = -1; 
$the_biggest_image = false;
$arr = array('ad', 'ads','gif');// substrings that mark an image URL as a probable ad
foreach($html->find('img') as $element) {
    preg_match_all('#https?://(.*?)($|/)#m', urldecode(stripcslashes($v)), $r); // $r[0][0] is the first full match: scheme + host (+ trailing slash); depends only on $v, yet is recomputed on every iteration
    $pic = $element->src;
    $comm = url_to_absolute( $r[0][0], $pic);// absolute URL of the image, resolved against the site base URL
    $check_flag = true;
    foreach($arr as $item) {
        if (substr_count(strtolower($comm),$item) > 0) $check_flag = false;
    }// $check_flag is now false when the URL contains any of the ad words
    // NOTE(review): this overwrites $arr (the ad-word list) with the getimagesize() result,
    // so later iterations no longer filter against the original words.
    if ($check_flag) $arr = @getimagesize($comm);// width is at index 0, height at index 1
    // NOTE(review): $comm is the return value of url_to_absolute() (a string or FALSE), but reset() expects an array.
    reset($comm);
        // NOTE(review): this comparison runs even when $check_flag is false, using whatever $arr currently holds.
        if (($arr[0] * $arr[1]) > $maxsize) {   
            $maxsize = $arr[0] * $arr[1];  // remember the largest area seen so far
            $the_biggest_image = $comm;
            echo '<img src="'.$the_biggest_image.'" />'; // NOTE(review): prints on every new maximum, not only for the final winner
        }
}
?>

url_to_absolute.php

<?php
/**
 * Edited by Nitin Kr. Gupta, publicmind.in
 */

/**
 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *  * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.php
 */

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  baseUrl     the absolute base URL.  It must have a scheme and
 *          a host; a path is optional ("http://example.com" is
 *          accepted and treated as having an empty path).
 *
 *  relativeUrl the relative URL to convert.  If it already has a
 *          scheme it is returned (with its path cleaned) and
 *          the base URL is not consulted.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs, or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
    // If relative URL has a scheme, clean path and return.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    if ( !empty( $r['scheme'] ) )
    {
        if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( !empty( $r['path'] ) )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path
    if ( empty( $r['path'] ) )
    {
        if ( !empty( $b['path'] ) )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path
    if ( $r['path'][0] != '/' )
    {
        // A base such as "http://example.com" has no 'path' key at all;
        // treat it as an empty path so the result is "/" . relative path
        // instead of reading an undefined index.
        $base = isset( $b['path'] )
            ? mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' )
            : FALSE;
        if ( $base === FALSE ) $base = '';
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}

/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.  Empty segments (as produced
 * by "//") are dropped as well.
 *
 * The path is split on the ASCII byte "/", which never occurs inside
 * a multi-byte UTF-8 sequence, so UTF-8 paths are handled correctly.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with "." and ".." removed.  A leading "/" and a
 *  trailing "/" of the input are preserved.  An empty path yields an
 *  empty string.
 */
function url_remove_dot_segments( $path )
{
    $path = (string) $path;

    // Nothing to filter; also avoids reading offset 0 of an empty string.
    if ( $path === '' )
        return '';

    $outSegs = array( );
    foreach ( explode( '/', $path ) as $seg )
    {
        if ( $seg === '' || $seg === '.' )
            continue;
        if ( $seg === '..' )
            array_pop( $outSegs );  // no-op when already at the root
        else
            $outSegs[] = $seg;
    }
    $outPath = implode( '/', $outSegs );
    if ( $path[0] === '/' )
        $outPath = '/' . $outPath;
    // Keep a trailing slash.  A byte-wise check is enough because "/"
    // is a single ASCII byte in UTF-8; the previous mb_strrpos() call
    // passed the encoding in the offset position, which PHP 8 rejects.
    if ( $outPath !== '/' && substr( $path, -1 ) === '/' )
        $outPath .= '/';
    return $outPath;
}


/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *  URI-reference   = URI
 *          / relative-ref
 *
 *  URI     = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  hier-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-rootless
 *          / path-empty
 *
 *  relative-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-noscheme
 *          / path-empty
 *
 *  authority   = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *  scheme
 *      The name of a method used to interpret the rest of
 *      the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *  authority
 *      The name of the authority governing the URL's name
 *      space.  Examples:  "example.com", "user@example.com",
 *      "example.com:80", "user:password@example.com:80".
 *
 *      The authority may include a host name, port number,
 *      user name, and password.
 *
 *      The host may be a name, an IPv4 numeric address, or
 *      an IPv6 numeric address.
 *
 *  path
 *      The hierarchical path to the URL's resource.
 *      Examples:  "/index.htm", "/scripts/page.php".
 *
 *  query
 *      The data for a query.  Examples:  "?search=google.com".
 *
 *  fragment
 *      The name of a secondary resource relative to that named
 *      by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *  url     the URL to parse.
 *
 *  decode      an optional boolean flag selecting whether
 *          to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *  the associative array of URL parts, or FALSE if the URL is
 *  too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
    // Purpose: match $url against one RFC3986-style regular expression
    // and copy the numbered capture groups into an associative array
    // ('scheme', 'user', 'pass', 'host', 'port', 'path', 'query',
    // 'fragment').  Returns FALSE when the URL does not match at all,
    // and an empty array when it matches but has no components (e.g. '').
    // With $decode set, percent-encoding is decoded in every component
    // except scheme and port; a host is decoded only when it is a name.

    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@% ';

    // Scheme from RFC3986.  The hyphen is escaped: written as "+-." it
    // formed a range that also admitted ",".
    $xscheme        = '([a-zA-Z][a-zA-Z\d+\-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '((['  . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    // The hyphen must be escaped: PCRE2 (PHP 7.3+) refuses to compile
    // an unescaped "-" that follows \d inside a character class, which
    // made every call return FALSE.
    $xhost_name    = '([a-zA-Z\d\-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                 '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
             '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';


    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    // Start from an empty array so that a URL with no recognisable
    // component (such as '') returns array() rather than an undefined
    // variable.
    $parts = array();

    // The indexes below are capture-group numbers of $xurl; they depend
    // on the exact nesting of the sub-patterns above.
    if ( !empty($m[2]) )        $parts['scheme']  = strtolower($m[2]);

    if ( !empty($m[7]) ) {
        if ( isset( $m[9] ) )   $parts['user']    = $m[9];
        else            $parts['user']    = '';
    }
    if ( !empty($m[10]) )       $parts['pass']    = $m[11];

    // $h is set only for a host *name*; it is the marker used further
    // down to decide whether the host may be percent-decoded.
    if ( !empty($m[13]) )       $h=$parts['host'] = $m[13];
    else if ( !empty($m[14]) )  $parts['host']    = $m[14];
    else if ( !empty($m[16]) )  $parts['host']    = $m[16];
    else if ( !empty( $m[5] ) ) $parts['host']    = '';
    if ( !empty($m[17]) )       $parts['port']    = $m[18];

    if ( !empty($m[19]) )       $parts['path']    = $m[19];
    else if ( !empty($m[21]) )  $parts['path']    = $m[21];
    else if ( !empty($m[25]) )  $parts['path']    = $m[25];

    if ( !empty($m[27]) )       $parts['query']   = $m[28];
    if ( !empty($m[29]) )       $parts['fragment']= $m[30];

    if ( !$decode )
        return $parts;
    if ( !empty($parts['user']) )
        $parts['user']     = rawurldecode( $parts['user'] );
    if ( !empty($parts['pass']) )
        $parts['pass']     = rawurldecode( $parts['pass'] );
    if ( !empty($parts['path']) )
        $parts['path']     = rawurldecode( $parts['path'] );
    if ( isset($h) )
        $parts['host']     = rawurldecode( $parts['host'] );
    if ( !empty($parts['query']) )
        $parts['query']    = rawurldecode( $parts['query'] );
    if ( !empty($parts['fragment']) )
        $parts['fragment'] = rawurldecode( $parts['fragment'] );
    return $parts;
}


/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  When a "scheme" and "host" are both
 * present, the "path" is presumed to be hierarchical and encoding
 * processes each segment of the hierarchy separately (i.e., the slashes
 * are left alone).
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *  parts       an associative array of strings containing the
 *          individual parts of a URL.
 *
 *  encode      an optional boolean flag selecting whether
 *          to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *  Returns the assembled URL string.  The string is an absolute
 *  URL if a scheme is supplied, and a relative URL if not.  An
 *  empty string is returned if the $parts array does not contain
 *  any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
    // Purpose: assemble the components in $parts ('scheme', 'user',
    // 'pass', 'host', 'port', 'path', 'query', 'fragment') into one URL
    // string.  'user', 'pass' and 'port' are emitted only when 'host'
    // is set.  Returns '' when $parts holds none of these keys.
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user']     = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass']     = rawurlencode( $parts['pass'] );
        // Encode the host only when it is a name.  The whole host must
        // look like an IP literal (optionally bracketed) to be exempt;
        // the previous pattern had a stray "]" and an unanchored
        // alternative, so any host merely *ending* in hex digits, dots
        // or colons was left unencoded.
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host']     = rawurlencode( $parts['host'] );
        // Encode the path as a whole, then restore the segment separators.
        if ( !empty( $parts['path'] ) )
            $parts['path']     = preg_replace( '!%2F!ui', '/',
                rawurlencode( $parts['path'] ) );
        if ( isset( $parts['query'] ) )
            $parts['query']    = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];         // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        // An authority must be separated from a rootless path.
        if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
            $url .= '/';
    }
    if ( !empty( $parts['path'] ) )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}

/**
 * Percent-encode a URL while leaving its structure intact.
 *
 * The whole string is first passed through rawurlencode(), then the
 * escapes of the RFC3986 reserved characters are turned back into the
 * literal characters.  "%25" is restored to "%" as the final step, so
 * an escape that was already present in the input (for instance '%20')
 * comes out unchanged instead of becoming '%2520'.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string.
 */
function encode_url($url) {
  // Literal reserved character => pattern matching its percent-escape.
  // The order matters: the "%" entry has to stay last.
  $restore = array(
    ':' => '!%3A!ui',
    '/' => '!%2F!ui',
    '?' => '!%3F!ui',
    '#' => '!%23!ui',
    '[' => '!%5B!ui',
    ']' => '!%5D!ui',
    '@' => '!%40!ui',
    '!' => '!%21!ui',
    '$' => '!%24!ui',
    '&' => '!%26!ui',
    "'" => '!%27!ui',
    '(' => '!%28!ui',
    ')' => '!%29!ui',
    '*' => '!%2A!ui',
    '+' => '!%2B!ui',
    ',' => '!%2C!ui',
    ';' => '!%3B!ui',
    '=' => '!%3D!ui',
    '%' => '!%25!ui',
  );

  $encoded = rawurlencode($url);
  // Apply the substitutions one after another, in table order.
  foreach ($restore as $literal => $escapePattern) {
    $encoded = preg_replace($escapePattern, $literal, $encoded);
  }
  return $encoded;
}

?>

Ответы [ 2 ]

1 голос
/ 13 апреля 2011

Вы не сказали, какую именно ошибку получаете, но в вашем коде действительно есть несколько ошибок. Проблемы могут возникать в этом блоке:

if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {   
    $maxsize = $arr[0] * $arr[1];  //compare images' sise
    $the_biggest_image = $comm;
    echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
}
  1. Строкой $arr = @getimagesize($comm); вы перезаписываете $arr, а это ваш массив слов для фильтрации рекламы.
  2. Если $check_flag ложно, последующие вычисления всё равно выполняются.
  3. reset() не работает со строками.
  4. Вы выводите $the_biggest_image при каждом обновлении максимального размера. Так и задумано?

UPDATE

Попытка заставить ваш код работать и, надеюсь, немного лучше:

<?php
// Finds the largest (by pixel area) non-ad image referenced by a page and
// prints a single <img> tag for it, or for a placeholder when none qualifies.
require_once 'simple_html_dom.php'; // presumably provides file_get_html() -- not shown in this listing
require 'url_to_absolute.php'; // provides url_to_absolute()
// options
$url = 'http://www.yomiuri.co.jp/stream/';
// Case-insensitive substrings that mark an image URL as a probable ad.
// NOTE(review): matching is by plain substring, so 'ad' also hits URLs such
// as ".../upload/..." and makes the 'ads' entry redundant.
$ignore = array('ad', 'ads','gif');
$biggestImage = 'path to "no image found" image';
// process
$maxSize = -1;
$visited = array(); // absolute URL => true, for O(1) duplicate detection
$html = file_get_html($url);
// If the page could not be loaded there is nothing to scan; the placeholder
// is printed instead of failing on a method call on a non-object.
$images = $html ? $html->find('img') : array();
// loop images
foreach ($images as $element) {
    $pic = $element->src;
    if (!is_string($pic) || $pic === '') continue; // <img> without a usable src
    // Resolve against the page URL itself so that a relative src is taken
    // relative to the page's directory rather than the site root.
    $absUrl = url_to_absolute($url, $pic);
    if ($absUrl === false) continue; // src could not be parsed
    // ignore already seen images, remember new ones
    if (isset($visited[$absUrl])) continue;
    $visited[$absUrl] = true;
    // skip probable ads
    foreach ($ignore as $item) {
        if (stripos($absUrl, $item) !== false) continue 2;
    }
    // Fetch the dimensions; index 0 is the width, index 1 the height.
    // Warnings are suppressed on purpose: unreachable or non-image URLs are
    // expected and are simply skipped.
    $image = @getimagesize($absUrl);
    if ($image === false) continue; // otherwise a failed lookup would count as area 0 and beat the initial -1
    $size = $image[0] * $image[1];
    if ($size > $maxSize) {
        $maxSize = $size;
        $biggestImage = $absUrl;
    }
}
// The URL comes from a remote page, so escape it before placing it in an attribute.
echo '<img src="'.htmlspecialchars($biggestImage, ENT_QUOTES, 'UTF-8').'" />'; // print the biggest one
?>
0 голосов
/ 04 марта 2017

На основе вашего кода я сделал следующее решение: оно использует ту же логику и позволяет задать минимальные ширину и высоту изображения, чтобы возвращались только подходящие изображения.

/**
 * Return the URL of the largest image (by pixel area) on a page.
 *
 * Images are taken from getImagesFromDom().  An image is considered only
 * if its URL contains none of the ad markers and both its width and its
 * height are strictly greater than the minimum (300 pixels each).
 *
 * @param string $pageUrl URL of the page to scan; also used as the base
 *                        for resolving relative image URLs.
 * @return string Absolute URL of the largest qualifying image, or '' when
 *                no image qualifies.
 */
private function getMainImageFromUrl($pageUrl) {

    $biggestImage = '';
    $minImgWidth = 300;
    $minImgHeight = 300;
    $images = $this->getImagesFromDom($pageUrl);
    $visited = array(); // absolute URL => true
    $maxSize = -1;
    $ignore = array('ad', 'ads','gif'); // case-insensitive substrings that mark a probable ad

    foreach ($images as $image) {
        $pic = $image->getAttribute('src');
        # if source is empty, skip to another image
        if ( $pic === '' )
            continue;
        # get image absolute url; url_to_absolute() requires the base URL
        # as its first argument
        $absUrl = url_to_absolute($pageUrl, $pic);
        if ( $absUrl === false )
            continue;
        # ignore already seen images (skip to another), add new images
        if ( isset( $visited[$absUrl] ) )
            continue;
        $visited[$absUrl] = true;
        # remove ads
        foreach ($ignore as $item) {
            if ( stripos( $absUrl, $item ) !== false )
                continue 2;
        }
        # warnings are suppressed on purpose: unreachable or non-image
        # URLs are expected and simply skipped
        $imageSize = @getimagesize($absUrl);
        if ( $imageSize === false )
            continue;
        # reject undersized images BEFORE comparing areas, so that a large
        # but rejected image (e.g. a wide, flat banner) cannot raise the
        # threshold and hide a smaller image that does qualify
        if ( !( $minImgWidth < $imageSize[0] && $minImgHeight < $imageSize[1] ) )
            continue;
        $size = $imageSize[0] * $imageSize[1];
        if ( $size > $maxSize ) {
            $maxSize = $size;
            $biggestImage = $absUrl;
        }
    }
    return $biggestImage;
}

/**
 * Load the document at $url with DOMDocument::loadHTMLFile() and return
 * its <img> elements.
 *
 * @param string $url location passed to loadHTMLFile().
 * @return DOMNodeList result of getElementsByTagName('img').
 */
private function getImagesFromDom( $url ) {
    # set default_socket_timeout to 4 before the document is fetched
    ini_set('default_socket_timeout', 4);

    $document = new DOMDocument();
    # loader warnings are silenced with @
    @$document->loadHTMLFile( $url );
    $document->preserveWhiteSpace = false;

    # collect every <img> element of the loaded document
    $imgNodes = $document->getElementsByTagName('img');
    return $imgNodes;
}
...