Я не знаю, есть ли для этого библиотека Apache, однако я использую HtmlUnit для обхода страницы и всех её подстраниц с помощью приведённого ниже кода. Затем загрузку можно выполнить через URLConnection (см., например, эту страницу).
/**
 * Loads the page at {@code startURL} and then follows the anchors found on it,
 * and on every page reached from it, via {@code recursivelyFollowLinks}.
 *
 * <p>The page is cleaned up and all windows of the web client are closed when
 * the walk ends, whether it completes normally or with an exception.
 *
 * @param startURL address of the first page to load
 * @throws IOException  propagated from loading the page or from following links
 * @throws SAXException propagated from following links
 */
public static void walkAllHtmlPages(final String startURL) throws IOException, SAXException {
    final WebClient webClient = createWebClient();
    try {
        final HtmlPage page = webClient.getPage(startURL);
        try {
            // Hrefs already handled; shared across the whole recursion.
            final Set<String> visitedURLs = new HashSet<String>();
            // Seed with the start address so an anchor whose href is textually
            // identical to it is not followed a second time.
            visitedURLs.add(startURL);
            // now recursively walk all pages
            recursivelyFollowLinks(webClient, page.getAnchors(), visitedURLs);
        } finally {
            if (page != null) {
                page.cleanUp();
            }
        }
    } finally {
        webClient.closeAllWindows();
    }
}
/**
 * Builds the {@code WebClient} used for crawling: Firefox 3.6 browser profile,
 * JavaScript off, CSS and applets on, redirects followed, and all parser /
 * incorrectness / CSS diagnostics silenced.
 *
 * @return a newly created, fully configured client
 */
public static WebClient createWebClient() {
    // Listener that deliberately drops every notification it receives.
    final IncorrectnessListener quietListener = new IncorrectnessListener() {
        @Override
        public void notify(String message, Object origin) {
            // Intentionally ignored for now; could be collected for later retrieval.
        }
    };

    final WebClient client = new WebClient(BrowserVersion.FIREFOX_3_6);

    // Network and feature switches.
    client.setTimeout(30000); // presumably milliseconds - confirm against the HtmlUnit API
    client.setJavaScriptEnabled(false);
    client.setCssEnabled(true);
    client.setAppletEnabled(true);
    client.setRedirectEnabled(true); // follow HTTP redirects, as a browser normally does

    // Diagnostics: no HTML parser listener, silent incorrectness and CSS handlers.
    client.setHTMLParserListener(null);
    client.setIncorrectnessListener(quietListener);
    client.setCssErrorHandler(new SilentCssErrorHandler());

    return client;
}
/**
 * Visits every anchor in {@code links} whose href has not been seen before.
 *
 * <p>Each new href is recorded in {@code visitedURLs} before the link is handed
 * to {@code visitSubLink}, so the same href string is never followed twice.
 * Hrefs are compared as the raw attribute text returned by
 * {@code getHrefAttribute()}.
 *
 * @param webClient   client passed through to {@code visitSubLink}
 * @param links       anchors of the page currently being processed
 * @param visitedURLs hrefs already handled; updated in place
 * @throws SAXException             propagated from {@code visitSubLink}
 * @throws IOException              propagated from {@code visitSubLink}
 * @throws IllegalArgumentException wrapping any {@code RuntimeException} raised
 *                                  while processing {@code links}
 */
private static void recursivelyFollowLinks(WebClient webClient, List<HtmlAnchor> links, Set<String> visitedURLs) throws SAXException, IOException {
    try {
        for (HtmlAnchor link : links) {
            String url = link.getHrefAttribute();
            // Set.add returns true only when the element was not yet present,
            // so this both tests and records the href in one step.
            if (visitedURLs.add(url)) {
                visitSubLink(webClient, visitedURLs, link, url);
            }
        }
    } catch (RuntimeException e) {
        throw new IllegalArgumentException("While retrieving links: " + getLinksAsString(links), e);
    }
}
/**
 * Clicks {@code link} and, if the result is an HTML page, continues the walk
 * with that page's anchors.
 *
 * <p>A click result that is not an {@code HtmlPage} is skipped rather than
 * cast, so that a single non-HTML target does not abort the walk with a
 * {@code ClassCastException}.
 *
 * @param webClient   client passed through to {@code recursivelyFollowLinks}
 * @param visitedURLs hrefs already handled; passed through to the recursion
 * @param link        anchor to click
 * @param url         href of {@code link}, used only in the error message
 * @throws IOException      propagated from clicking the link or from the recursion
 * @throws SAXException     propagated from the recursion
 * @throws RuntimeException wrapping any {@code RuntimeException} raised while
 *                          clicking the link or walking the resulting page
 */
private static void visitSubLink(WebClient webClient,
        Set<String> visitedURLs, HtmlAnchor link, String url) throws IOException, SAXException {
    try {
        Object target = link.click();
        if (!(target instanceof HtmlPage)) {
            // Nothing to walk: the click did not yield an HTML page.
            return;
        }
        recursivelyFollowLinks(webClient, ((HtmlPage) target).getAnchors(), visitedURLs);
    } catch (RuntimeException e) { // NOPMD
        throw new RuntimeException("While clicking link: " + link.getId() + " to " + url, e);
    }
}