Поиск немецких телефонных номеров с помощью Selenium (Java) - PullRequest
0 голосов
/ 04 апреля 2019

Я пытаюсь извлекать телефонные номера (в немецком формате) с немецких веб-сайтов с помощью регулярного выражения и Selenium WebDriver API. Моя проблема в том, что я получаю ложные срабатывания, которые пока не удаётся исключить. Может ли кто-нибудь помочь мне оптимизировать регулярное выражение, чтобы я мог быть уверен, что каждое совпадение — действительно телефонный номер? «Impressum» — это обычное название страницы с контактной информацией, поэтому в коде я ищу на сайте ссылку с этим словом, нажимаю на неё и загружаю текст страницы (содержимое body) в виде строки. Затем я ищу в этом тексте телефонные номера с помощью регулярного выражения. Спасибо.

  /**
   * Follows the site's "Impressum" link when one is found and stores the text of the
   * page body in {@code impressum}.
   *
   * <p>The first link whose text contains "mpress" is looked up. It is clicked only if
   * its text is exactly "Impressum" or "impressum". The body text of whichever page is
   * displayed afterwards is stored either way.
   */
  public void search() {
      jse = (JavascriptExecutor) driver;

      WebElement impressumLink = driver.findElement(By.partialLinkText("mpress"));
      String linkText = impressumLink.getText();
      boolean isImpressumLink = linkText.equals("Impressum") || linkText.equals("impressum");
      if (isImpressumLink) {
          impressumLink.click();
      }

      // Store the text of the <body> element for the later phone-number search.
      WebElement pageBody = driver.findElement(By.tagName("body"));
      impressum.setBody(pageBody.getText());
  }

    /**
     * Prints every phone-number-like token that ends a line of the stored Impressum text.
     *
     * <p>A match consists of an optional international prefix (for example {@code +49}),
     * an optional parenthesised digit group, and a run of 10 to 20 characters that starts
     * and ends with a digit and may contain spaces, dots, hyphens and slashes in between.
     * The match must reach the end of its line.
     *
     * <p>{@link Pattern#COMMENTS} is deliberately not used: in that mode Java ignores
     * whitespace in the pattern, including whitespace written inside character classes,
     * so a literal space inside a class never matches and numbers such as
     * {@code 0180 / 60 05 85 0} are missed.
     */
    @SuppressWarnings("Duplicates")
    public void TelRegex() {
        final String body = impressum.getBody();
        if (body == null || body.isEmpty()) {
            // Nothing has been downloaded, so there is nothing to scan.
            return;
        }

        // Separators that may follow the international prefix or the parenthesised group.
        final String separators = "[ ./-]*";
        final String regex =
                "(\\+[0-9]{1,3}" + separators + ")?"
                + "(\\([0-9]{1,6}\\)" + separators + ")?"
                // Digit at both ends keeps stray leading/trailing separators out of the match.
                + "([0-9][0-9 ./-]{8,18}[0-9])$";

        // MULTILINE lets '$' match at the end of every line, not only at the end of the text.
        final Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE);
        final Matcher matcher = pattern.matcher(body);

        while (matcher.find()) {
            System.out.println("Full match: " + matcher.group(0));
        }
    }

«Impressum» — это обычное название страницы с контактной информацией, поэтому в коде я ищу на сайте ссылку с этим словом, нажимаю на неё и загружаю текст страницы в виде строки. Затем я ищу телефонные номера в этом тексте с помощью регулярного выражения. Оно находит номера телефонов, но иногда в результат попадают и другие числа, которые телефонными номерами не являются.

Ответы [ 2 ]

0 голосов
/ 15 апреля 2019

Извлечение номера телефона на основе префиксов:

/**
 * Opens the Impressum page of vario-doser.de and collects every text line of the
 * "content" element that contains one of the known dialling prefixes.
 *
 * <p>Each matching line is reported as it is found and listed again at the end. Lines
 * are selected by prefix only, so a fax line that uses the same prefix is collected too.
 */
public void extractAllPhoneNumbers() {
    ArrayList<String> phoneNumbers = new ArrayList<String>();

    driver.get("https://www.vario-doser.de/");
    WebElement impressumLink = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.xpath("//a[@href='ueber-uns/impressum/']")));
    impressumLink.click();
    WebElement content = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.id("content")));
    String[] contentText = content.getText().split("\\n");

    String[] prefixes = {"0180 / ", "09721 / "};

    for (String line: contentText) {
        boolean matched = false;
        for (String prefix: prefixes) {
            int prefixStart = line.indexOf(prefix);
            if (prefixStart >= 0) {
                phoneNumbers.add(line);
                // substring() rather than split(prefix)[1]: split() interprets the prefix
                // as a regex and yields no second element when nothing follows the prefix.
                System.out.println("Extracting: " + line.substring(prefixStart + prefix.length()));
                matched = true;
                // Stop at the first matching prefix so the line is collected only once.
                break;
            }
        }
        if (!matched) {
            // Reported once per line, and only when none of the prefixes matched.
            System.out.println("Textline does not contain any of the prefixes.");
        }
    }
    if (!phoneNumbers.isEmpty()) {
        System.out.println("Extracted phone numbers:");
        for (String phoneNumber: phoneNumbers) {
            System.out.println(phoneNumber);
        }
    }
    else {
        System.out.println("No phone number found.");
    }

}

Однако в результат попадает и номер факса.

...
Textline does not contain any of the prefixes.
Extracted phone numbers:
Tel.: 09721 / 533404
Fax: 09721 / 533405
Tel: 0180 / 60 05 85 0
0 голосов
/ 15 апреля 2019

Класс:

package syed;

import java.util.ArrayList;
import java.util.Objects;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

/**
 * JUnit 4 test that opens the Impressum page of vario-doser.de in Chrome and prints
 * every text line of the "content" element that starts with "Tel".
 */
public class Syed {
    private static WebDriver driver;

    /** Starts one Chrome session, shared by all tests of this class. */
    @BeforeClass
    public static void setUpClass() {
        System.setProperty("webdriver.chrome.driver", "C:\\Users\\pburgr\\Desktop\\selenium-tests\\GCH_driver\\chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        options.addArguments("user-data-dir=C:\\Users\\pburgr\\AppData\\Local\\Google\\Chrome\\User Data");
        driver = new ChromeDriver(options);
        driver.manage().window().maximize();
    }

    @Before
    public void setUp() {}

    @After
    public void tearDown() {}

    /** Closes the browser session, if one was started. */
    @AfterClass
    public static void tearDownClass() {
        // driver stays null when setUpClass() failed before the session was created.
        if (driver != null) {
            driver.quit();
        }
    }

    /**
     * Navigates to the Impressum page and collects the lines that start with "Tel".
     * Every line is reported as it is inspected; the collected lines are listed at the end.
     */
    @Test
    public void extractAllPhoneNumbers() {
        ArrayList<String> phoneNumbers = new ArrayList<String>();

        driver.get("https://www.vario-doser.de/");
        WebElement impressumLink = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.xpath("//a[@href='ueber-uns/impressum/']")));
        impressumLink.click();
        WebElement content = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.id("content")));
        String[] contentText = content.getText().split("\\n");

        for (String line: contentText) {
            // startsWith() is safe for lines shorter than three characters, whereas
            // substring(0, 3) throws StringIndexOutOfBoundsException for them.
            if (line.startsWith("Tel")) {
                phoneNumbers.add(line);
                System.out.println("Extracting: " + line);
            }
            else {
                System.out.println("Textline does not beginn with 'Tel'");
            }
        }
        if (!phoneNumbers.isEmpty()) {
            System.out.println("Extracted phone numbers:");
            for (String phoneNumber: phoneNumbers) {
                System.out.println(phoneNumber);
            }
        }
        else {
            System.out.println("No phone number found.");
        }

    }

    /**
     * Creates an explicit wait bound to the given driver.
     *
     * @param driver the browser session to wait on
     * @param sec    the timeout value passed to {@link WebDriverWait}
     * @return a new wait instance
     */
    public WebDriverWait waitSec(WebDriver driver, int sec) {
        return new WebDriverWait(driver, sec);
    }
}

Вывод:

Starting ChromeDriver 2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5) on port 3253
Only local connections are allowed.
Dub 15, 2019 9:46:23 DOP. org.openqa.selenium.remote.ProtocolHandshake createSession
INFO: Detected dialect: OSS

Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Extracting: Tel.: 09721 / 533404
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Extracting: Tel: 0180 / 60 05 85 0
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Extracted phone numbers:
Tel.: 09721 / 533404
Tel: 0180 / 60 05 85 0

Это то, что вам нужно?

...