Я парсю (скрейплю) этот сайт с помощью Node.js и Puppeteer. Проблема в HTML: приходит разметка, которая, как мне кажется, ломает HTML, поэтому я не могу получить весь HTML, который мне нужен.
// Selector for the <div> siblings that follow the large leaderboard table.
const css = '.the-leaderboard.with-rolex > table.leaderboard.leaderboard-table.large ~ div';
const divs = Array.from(document.querySelectorAll(css));
// Re-parse the first div's markup into a separate document and collect its table bodies.
const parser = new DOMParser();
const $ = parser.parseFromString(divs[0].innerHTML, 'text/html');
const tbody = $.querySelectorAll('table > tbody');
// ISSUE: this is the markup that loses its rows once it is re-parsed on its own.
tbody[0].outerHTML;
Если вы перейдёте на указанный сайт и вставите приведённый выше код, вы получите следующий HTML (Раздел 2):
<tbody>
<tr class="line-row line-row-28089 even">
<td class="my-favourite"><span class="icon-fav add"><span class="favorite-tooltip">Add player to<br><strong>My
Leaderboard</strong></span></span></td>
<td class="position">T1</td>
<td class="position-movement">
<div class="position-movement">--</div>
</td>
<td class="country hidden-medium"><span class="flag flag-24x24 AUS"></span></td>
<td class="player-name">
<div class="player-name-inner"><span class="flag flag-24x24 AUS visible-medium"></span>
<div class="player-name-col">Jason Day </div>
<div class="player-media-col media-icons"><span class="icon article icon-article"></span><span
class="icon regular-video-icon"></span></div>
</div>
</td>
//=============ISSUE===============
//below td missing when you parse var $$ =
//parser.parseFromString(tbody[0].innerHTML, 'text/html')
<td class="total">-5</td>
<td class="thru">F*</td>
<td class="round">-5</td>
<td class="round-x">65</td>
<td class="round-x">--</td>
<td class="round-x">--</td>
<td class="round-x">--</td>
<td class="strokes">65</td>
<td class="rank-proj">13</td>
<td class="rank-starting">51</td>
<td class="rank-movement"><span class="icon-movement up"></span>38</td>
<td class="color-selector"><span class="colorpicker"></span></td>
</tr>
<tr class="pbp-line even">
<td class="my-favourite"> </td>
<td class="position"> </td>
<td class="position-movement"> </td>
<td class="country"> </td>
<td class="pbp-text" colspan="15">Round complete.</td>
</tr>
Проблема
При синтаксическом разборе tbody[0].outerHTML получается HTML, в котором отсутствуют элементы tr и td (см. изображение):
var $$ = parser.parseFromString(tbody[0].innerHTML, 'text/html')
Хочу
HTML из Раздела 2 в переменной $$:
var $$ = parser.parseFromString(tbody[0].innerHTML, 'text/html')
Код
'use strict';
const puppeteer = require('puppeteer');
/**
 * Opens the PGA Tour leaderboard in a visible browser window and extracts the
 * markup of the first leaderboard table body.
 *
 * The markup is read straight from the live DOM. Re-parsing a fragment that
 * starts with <tbody>/<tr>/<td> through DOMParser in 'text/html' mode drops
 * those tags, because the HTML parser ignores table-section tags that are not
 * inside a <table>; that is what made rows and cells disappear.
 *
 * @returns {Promise<string|null>} outerHTML of the first `table > tbody` found
 *   inside the first matching leaderboard <div>, or null when none is present.
 * @throws Rejects with whatever error Puppeteer raises (launch failure,
 *   navigation failure, selector timeout, ...).
 */
async function run() {
  const css = '.the-leaderboard.with-rolex > table.leaderboard.leaderboard-table.large ~ div';

  const browser = await puppeteer.launch({
    headless: false
  });

  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    // Puppeteer's API is setViewport(); setViewportSize() does not exist on Page.
    await page.setViewport({ width: 1200, height: 1000 });

    // With interception enabled every request must be continued explicitly.
    page.on('request', request => {
      // Non-navigation requests pass through untouched.
      if (!request.isNavigationRequest()) {
        request.continue();
        return;
      }
      // Navigation requests are re-issued with their headers; extra headers
      // (for example a proxy header) can be added to this object.
      const headers = request.headers();
      request.continue({ headers });
    });

    await page.goto("https://www.pgatour.com/leaderboard.html");
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    // Wait for the table body itself instead of sleeping for a fixed time.
    await page.waitForSelector(`${css} table > tbody`);

    const html = await page.evaluate((selector) => {
      // The selector matches several <div>s; only the first one is inspected.
      const firstDiv = document.querySelector(selector);
      if (!firstDiv) {
        return null;
      }
      const tbody = firstDiv.querySelector('table > tbody');
      // The value must be returned, otherwise evaluate() resolves with undefined.
      return tbody ? tbody.outerHTML : null;
    }, css);

    return html;
  } finally {
    // Close the browser on success and on failure so no process is leaked.
    await browser.close();
  }
}
// Start the scrape: print the resolved value, or report the failure.
run()
  .then((result) => console.log(result))
  .catch((error) => console.error(error));