Я уничтожаю таблицы с этой платформы .После получения всей информации я успешно сохраняю их в список.Следующее, что я хочу сделать, это отфильтровать единицу измерения (км2) из списка, содержащего записи.
Например:
Площадь водосбора имеет размер 0,1 км2, я хочу удалить км2 для всех записейв этом столбце
Подход, который я использую:
# -*- coding utf-8 -*-
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
import time
import os
import shutil
from bs4 import BeautifulSoup
import uuid
import csv
import dateutil.parser as parser
import pandas as pd
from selenium.webdriver.support.select import Select
class crawlOcean():
def __init__(self):
print("hurray33")
global downloadDir
global uFileName
global filname
downloadDir = ""
uFileName = str(uuid.uuid4())
filname = downloadDir + uFileName + ".csv"
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList", 2)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.download.dir", downloadDir)
fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
"attachment/csv")
options = Options()
options.add_argument("--headless")
self.driver = webdriver.Firefox(firefox_profile=fp)
#self.driver = webdriver.Firefox()
print("hurray")
self.driver.implicitly_wait(15)
self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels")
self.verificationErrors = []
self.accept_next_alert = True
def crawl(self):
print("see")
driver = self.driver
driver.execute_script("window.scrollTo(0, 800)")
driver.find_element_by_id("dijit_MenuItem_3_text").click()
driver.find_element_by_xpath('//td[.="All"]').click()
driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
headers = []
for m in soup.find_all("th"):
headers.append(m.get_text())
print(headers)
content = []
finalContent = []
filterList = ["km²"]
for table in soup.find_all("table")[5::]:
for row in table.find_all("tr"):
contentCells = []
for cells in row.find_all("td"):
if cells.text.split()not in filterList:
contentCells.append(cells.text)
content.append(contentCells)
print(content)
with open(filname, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(headers)
writer.writerows(content)
driver.close()
if __name__ == '__main__':
obj = crawlOcean()
obj.crawl()
Пример вывода:
['Station number', 'Station name', 'Waterbody', 'Status', 'Type of Gauge', 'Catchment area', 'Data Provider', 'River Basin', 'Timestamp', 'Value', 'Unit']
[['14107', 'BAYLOUGH BR.', 'BARROW', 'Active', 'Recorder', '431.50 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '58.419', 'm OD Malin (OSGM02)'], ['18118', 'SHANBALLYMORE', 'SPRING', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Blackwater (Munster)', '25-10-2018 09:00', '0.432', '---'], ['14108', 'BALLYNAFAGH', 'STREAM', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '84.531', 'm OD Malin (OSGM02)'], ['14104', 'GREESEMOUNT', 'GREESE', 'Active', 'Recorder', '74.60 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 08:00', '90.877', 'm OD Malin (OSGM02)'], ['14100', 'KYLE SPRING', 'SPRING', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Barrow', '17-10-2018 14:15', '91.595', 'm OD Malin (OSGM02)'], ['03059', 'DRUMULLY BR.', 'MOUNTAIN WATER', 'Active', 'Staff Gauge Only', '0.00 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '01-10-2018 12:45', '100.280', 'm (TBM)'], ['03070', 'EMY LOUGH', 'EMY LOUGH', 'Active', 'Recorder', '', 'Environmental Protection Agency', 'Blackwater (Ulster)', '25-10-2018 09:00', '50.884', 'm OD (Poolbeg)'], ['03057', 'EMYVALE WEIR', 'MOUNTAIN WATER', 'Active', 'Recorder', '37.80 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '25-10-2018 09:00', '52.431', 'm OD Malin (OSGM02)'], ['18048', 'DROMCUMMER', 'BLACKWATER [MUNSTER]', 'Active', 'Recorder', '867.70 km²', 'Environmental Protection Agency', 'Blackwater (Munster)', '25-10-2018 01:00', '60.605', 'm OD Malin (OSGM02)'], ['14057', 'TIMOLIN', 'BOTHOGUE', 'Active', 'Recorder', '18.20 km²', 'Environmental Protection Agency', 'Barrow', '25-10-2018 09:00', '89.953', 'm OD Malin (OSGM02)'], ['28011', 'DOO LOUGH OUTFLOW', '---', 'Active', 'Recorder', '22.81 km²', 'Environmental Protection Agency', 'Annageeragh-Annagh-Creegh', '25-10-2018 09:00', '80.743', 'm OD Malin (OSGM02)'], ['07071', 'SKEAGH', 'SKEAGH L.', 'Active', 'Recorder', '5.30 km²', 'Environmental Protection Agency', 'Boyne', '15-08-2018 14:00', '149.503', 'm OD (Poolbeg)'], ['03051', 'FAULKLAND', 'BLACKWATER (MONAGHAN)', 'Active', 'Recorder', '143.20 km²', 'Environmental Protection Agency', 'Blackwater (Ulster)', '19-09-2018 17:00', '41.572', 'm OD Malin (OSGM02)'], ['07077', 'WHITE LOUGH.', 'ANNAGH OR WHITE LOUGH', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Boyne', '05-09-2018 13:30', '105.169', 'm OD Malin (OSGM02)'], ['10038', 'DRUIDS GLEN', 'STREAM', 'Active', 'Recorder', '16.00 km²', 'Environmental Protection Agency', 'Potters-Redcross-Three Mile Water', '25-10-2018 09:00', '19.377', 'm OD Malin (OSGM02)'], ['36031', 'LISDARN', 'CAVAN', 'Active', 'Recorder', '63.80 km²', 'Environmental Protection Agency', 'Erne', '25-10-2018 09:00', '52.317', 'm OD Malin (OSGM02)'], ['07074', 'BALLANY', 'LENE L.', 'Active', 'Recorder', '13.00 km²', 'Environmental Protection Agency', 'Boyne', '25-10-2018 09:00', '92.667', 'm OD Malin (OSGM02)'], ['16047', 'CARROWCLOGH', 'ARA', 'Active', 'Recorder', '44.10 km²', 'Environmental Protection Agency', 'Suir', '25-10-2018 08:00', '80.820', 'm OD Malin (OSGM02)'], ['30012', 'CLAREGALWAY', 'CLARE', 'Active', 'Recorder', '1072.90 km²', 'Environmental Protection Agency', 'Corrib', '24-10-2018 20:30', '6.355', 'm OD Malin (OSGM02)'], ['16045', 'BALLYSHONOCK RESRVR.', 'DAWN', 'Active', 'Recorder', '5.20 km²', 'Environmental Protection Agency', 'Suir', '09-08-2018 13:15', '89.683', 'm OD Malin (OSGM02)'], ['07078', 'LOUGH BANE', 'BANE L.', 'Active', 'Recorder', '', 'Environmental Protection Agency', '---', '25-10-2018 09:00', '111.051', 'm OD Malin (OSGM02)'], ['10028', 'KNOCKNAMOHILL', 'AUGHRIM', 'Active', 'Recorder', '203.00 km²', 'Environmental Protection Agency', 'Avoca', '24-10-2018 20:00', '21.322', 'm OD Malin (OSGM02)'], ['29071', 'CUTRA', 'L. CUTRA', 'Active', 'Recorder', '123.80 km²', 'Environmental Protection Agency', 'Kinvarra', '25-10-2018 08:15', '32.694', 'm OD Malin (OSGM02)'], ['25070', 'WHITEBRIDGE.', 'L.ENNELL', 'Active', 'Recorder', '147.80 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '78.934', 'm OD Malin (OSGM15)'], ['29018', 'CLARINBRIDGE SPRING', 'SPRING', 'Active', 'Recorder', '', 'Environmental Protection Agency', 'Kilcogan', '27-09-2018 12:30', '2.926', 'm OD Malin (OSGM02)'], ['25072', 'CAPTAIN S HILL', 'OWEL L.', 'Active', 'Recorder', '22.60 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '96.146', 'm OD Malin (OSGM15)'], ['10021', 'COMMON S ROAD', 'SHANGANAGH', 'Active', 'Recorder', '32.50 km²', 'Environmental Protection Agency', 'Loughlinstown', '25-10-2018 08:00', '10.806', 'm OD (Poolbeg)'], ['26204', 'BALLYMARTIN', 'HIND', 'Active', 'Recorder', '44.70 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '40.009', 'm OD Malin (OSGM02)'], ['33001', 'GLENAMOY', 'GLENAMOY', 'Active', 'Recorder', '76.10 km²', 'Environmental Protection Agency', 'Glenamoy-Ballinglen-Glencullen', '25-10-2018 09:00', '3.901', 'm OD Malin (OSGM15)'], ['22031', 'KILLARNEY SW (New)', 'L. LEANE TRIB', 'Active', 'Recorder', '0.06 km²', 'Environmental Protection Agency', 'Laune', '21-08-2018 11:15', '19.570', 'm OD Malin (OSGM02)'], ['32076', 'DOO LOUGH', 'DOO LOUGH [MAYO]', 'Active', 'Recorder', '0.00 km²', 'Environmental Protection Agency', 'Owenglin-Dawros-Culin-Traheen', '03-09-2018 11:30', '30.237', 'm OD Malin (OSGM02)'], ['32073', 'LETTERETTRIM', 'L. FEE', 'Active', 'Recorder', '15.70 km²', 'Environmental Protection Agency', 'Carrownisky-Owenwee-Carrowbeg', '25-10-2018 09:00', '44.589', 'm OD Malin (OSGM15)'], ['32070', 'L.FEEAGH', 'L. FEEAGH', 'Active', 'Recorder', '84.30 km²', 'Environmental Protection Agency', 'Srahmore', '25-10-2018 09:00', '10.852', 'm OD Malin (OSGM15)'], ['32026', 'BUNDORRAGHA', 'BUNDORRAGHA', 'Active', 'Recorder', '48.30 km²', 'Environmental Protection Agency', 'Owenglin-Dawros-Culin-Traheen', '25-10-2018 09:00', '3.358', 'm OD Malin (OSGM15)'], ['25046', 'LISMOYNY', 'BROSNA', 'Active', 'Recorder', '304.50 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 09:00', '57.021', 'm OD Malin (OSGM15)'], ['35073', 'L.GILL', 'L.GILL', 'Active', 'Recorder', '362.60 km²', 'Environmental Protection Agency', 'Garvogue', '25-10-2018 09:00', '3.823', 'm OD Malin (OSGM15)'], ['25044', 'COOLE', 'KILMASTULLA', 'Active', 'Recorder', '92.54 km²', 'Environmental Protection Agency', 'Shannon', '25-10-2018 08:15', '29.649', 'm OD Malin (OSGM02)'], ['35072', 'TRASGARVE', 'L. EASKY', 'Active', 'Recorder', '10.70 km²', 'Environmental Protection Agency', 'Easky-Dunneil', '06-09-2018 10:45', '183.646', 'm OD Malin (OSGM02)']
Кто-нибудь поможет мне следовать правильному подходу?