rvest - удалить абзацы из нежелательного текста - PullRequest
0 голосов
/ 08 февраля 2019

Я хотел бы удалить множество лишних абзацев из вывода приведённого ниже кода rvest:

 library(rvest)

    # Company financials page to scrape.
    link <- "https://www.duedil.com/company/gb/02666908/yorwaste-limited/financials"

    # Parse the page, select every <script> element whose contents mention
    # 'js-financials-component', and keep the text of those elements.
    doc <- html_text(
      html_nodes(
        read_html(link),
        "script:contains('js-financials-component')"
      )
    )

По сути, я хотел бы оставить только приведённый ниже текст и вывести его в формате JSON. Я не знаю, с чего начать; пожалуйста, может ли кто-нибудь подсказать? Большое спасибо.

{
          companyName: {"name":"Yorwaste Limited"},
          numAccounts: 1,
          accounts: [{"title":"Summary","rows":[{"label":"Reporting Period (Months)","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":12,"formattedValue":"12","delta":null}]},{"label":"Consolidated Accounts","description":"Financial statements of the parent (company) and its subsidiaries are presented as those of a single economic entity.","chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":"Y","formattedValue":"Y","delta":null}]},{"label":"Number of Employees","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":318,"formattedValue":"318","delta":64.77}]},{"label":"Turnover","description":"Revenue generated from business activities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40328232,"formattedValue":"40,328,232","delta":3.67}]},{"label":"EBITDA","description":"Earnings before interest, tax, depreciation and amortization.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":2834874,"formattedValue":"2,834,874","delta":62.78}]},{"label":"Post-tax Profit","description":"Profit generated after taxation.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":885230,"formattedValue":"885,230","delta":52.17}]},{"label":"Total Assets","description":"The value of all assets on the Balance Sheet.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 
2018","timestamp":1522454400,"suffix":"GBP","value":40264952,"formattedValue":"40,264,952","delta":25.58}]},{"label":"Net Assets","description":"Total Assets less Total Liabilities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":10613963,"formattedValue":"10,613,963","delta":9.1}]},{"label":"Return on Capital Employed (%)","description":"Operating Profit expressed as a percentage of average Capital Employed.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":6.97,"formattedValue":"6.97","delta":null}]},{"label":"Debt to Capital (%)","description":"Total Liabilities expressed as a percentage of Total Assets.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":73.64,"formattedValue":"73.64","delta":null}]}]}],
          growth: [{"title":"Summary","rows":[{"label":"Reporting Period (Months)","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":12,"formattedValue":"12"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Consolidated Accounts","description":"Financial statements of the parent (company) and its subsidiaries are presented as those of a single economic entity.","chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":"Y","formattedValue":"Y"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Number of Employees","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":318,"formattedValue":"318"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Turnover","description":"Revenue generated from business activities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40328232,"formattedValue":"40,328,232"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"EBITDA","description":"Earnings before interest, tax, depreciation and amortization.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":2834874,"formattedValue":"2,834,874"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Post-tax Profit","description":"Profit generated after taxation.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 
2018","timestamp":1522454400,"suffix":"GBP","value":885230,"formattedValue":"885,230"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Total Assets","description":"The value of all assets on the Balance Sheet.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40264952,"formattedValue":"40,264,952"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Net Assets","description":"Total Assets less Total Liabilities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":10613963,"formattedValue":"10,613,963"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Return on Capital Employed (%)","description":"Operating Profit expressed as a percentage of average Capital Employed.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":6.97,"formattedValue":"6.97"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Debt to Capital (%)","description":"Total Liabilities expressed as a percentage of Total Assets.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":73.64,"formattedValue":"73.64"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}}]}]

1 Ответ

0 голосов
/ 09 февраля 2019

Вы можете просто загрузить исходный контент веб-страницы и извлечь необходимый фрагмент с помощью регулярного выражения:

library(httr)
library(stringr)

# Download the page. GET() returns an httr response object, not the page text.
r <- GET('https://www.duedil.com/company/gb/02666908/yorwaste-limited/financials')

# Stop on an HTTP error status instead of running the regex on an error page.
stop_for_status(r)

# str_match() operates on character vectors, so extract the response body as a
# single string first; passing the response object itself never searches the
# HTML. NOTE(review): assumes the page is served as UTF-8 - confirm.
html <- content(r, as = "text", encoding = "UTF-8")

# Column 1 of the result is the whole match; column 2 is the capture group,
# i.e. the `{ companyName: ... }` object passed to `new Widget(...)`.
q <- str_match(
  html,
  "new Widget\\([\\s\\S]*?(\\{\\s*companyName:[\\s\\S]*?\\})\\)"
)

# Captured object text; NA when the pattern is not found in the page.
d <- q[1, 2]

Дисклеймер о разборе HTML с помощью регулярных выражений

...