Я пытаюсь собрать и упорядочить определённые поля данных, полученных веб-скрейпингом в R. Я загрузил всю страницу в R как фрейм данных, но хочу извлечь только числовые значения, находящиеся между совпадениями регулярных выражений (если они есть), и поместить их в новый фрейм данных.
Меня интересуют исключительно числовые значения, встречающиеся внутри следующего шаблона:
<transactionPricePerShare>
<value>27.10</value>
</transactionPricePerShare>
В некоторых случаях таких экземпляров может быть несколько; тогда я хотел бы объединить эти значения в один фрейм данных (по одной строке на значение). Если экземпляр всего один, во фрейме данных должна остаться одна строка.
Вот мой пример скрипта и вывод dput() со структурой образца данных:
# Flatten every column of the scraped data frame into a single character
# vector (one element per scraped line) and strip surrounding whitespace.
form4_2 <- str_trim(as.character(unlist(df, use.names = FALSE)))

#' Extract the `<value>` payload of every `<tag>...</tag>` element.
#'
#' The value is searched for on all lines from the opening tag through the
#' closing tag, because other child lines (e.g. `<footnoteId .../>`) may sit
#' between the `<value>` line and the closing tag. Taking "the line before
#' the closing tag" therefore picks the wrong line.
#'
#' @param lines Character vector of trimmed lines, in document order.
#' @param tag Element name without angle brackets.
#' @return Character vector with one entry per element found; `NA` for an
#'   element that contains no `<value>` line. Length zero if the tag is absent.
extract_tag_values <- function(lines, tag) {
  open_idx <- which(str_detect(lines, fixed(paste0("<", tag, ">"))))
  close_idx <- which(str_detect(lines, fixed(paste0("</", tag, ">"))))

  # Elements are paired positionally, so the tags must be balanced and each
  # closing tag must not precede its opening tag.
  if (length(open_idx) != length(close_idx) || any(close_idx < open_idx)) {
    stop("Unbalanced <", tag, "> tags in input", call. = FALSE)
  }

  vapply(
    seq_along(open_idx),
    function(i) {
      element <- lines[open_idx[i]:close_idx[i]]
      found <- str_match(element, "<value>(.*?)</value>")[, 2]
      found <- found[!is.na(found)]
      if (length(found) == 0) NA_character_ else found[[1]]
    },
    character(1)
  )
}

# Collect Transaction Price
# One row per <transactionPricePerShare> element, whether there is one
# transaction or several.
form4_3 <- extract_tag_values(form4_2, "transactionPricePerShare")
form4_4 <- data.frame(
  "Transacted Price ($)" = as.numeric(form4_3),
  check.names = FALSE
)
Структура df:
structure(list(V1 = structure(c(124L, 125L, 119L, 130L, 132L,
136L, 131L, 134L, 133L, 137L, 111L, 91L, 87L, 109L, 95L, 97L,
98L, 93L, 110L, 104L, 89L, 102L, 105L, 135L, 108L, 92L, 88L,
99L, 96L, 100L, 94L, 107L, 103L, 90L, 101L, 106L, 86L, 110L,
103L, 90L, 101L, 106L, 115L, 121L, 128L, 126L, 122L, 120L, 127L,
129L, 118L, 123L, 85L, 77L, 83L, 81L, 79L, 62L, 63L, 64L, 72L,
84L, 67L, 44L, 46L, 58L, 66L, 49L, 50L, 45L, 47L, 51L, 48L, 57L,
68L, 36L, 37L, 39L, 38L, 40L, 41L, 59L, 75L, 80L, 65L, 52L, 29L,
32L, 55L, 28L, 35L, 54L, 25L, 24L, 19L, 34L, 53L, 27L, 6L, 17L,
26L, 5L, 1L, 16L, 23L, 10L, 15L, 33L, 43L, 22L, 3L, 14L, 31L,
42L, 18L, 11L, 12L, 21L, 9L, 13L, 30L, 56L, 65L, 52L, 29L, 32L,
55L, 28L, 35L, 54L, 25L, 24L, 19L, 20L, 34L, 53L, 27L, 7L, 17L,
26L, 4L, 2L, 16L, 23L, 10L, 15L, 33L, 43L, 22L, 8L, 14L, 31L,
42L, 18L, 11L, 12L, 21L, 9L, 13L, 30L, 56L, 73L, 76L, 78L, 60L,
61L, 71L, 82L, 70L, 69L, 74L, 113L, 117L, 116L, 112L, 114L), .Label = c(" <footnoteId id=F1/>",
" <footnoteId id=F2/>", " <value>1206005</value>",
" <value>27.10</value>", " <value>31.84</value>",
" <value>44771</value>", " <value>600000</value>",
" <value>606005</value>", " <value>By Ronin Trading, LLC</value>",
" <value>D</value>", " <value>I</value>",
" </directOrIndirectOwnership>", " </natureOfOwnership>",
" </sharesOwnedFollowingTransaction>", " </transactionAcquiredDisposedCode>",
" </transactionPricePerShare>", " </transactionShares>",
" <directOrIndirectOwnership>", " <equitySwapInvolved>0</equitySwapInvolved>",
" <footnoteId id=F2/>", " <natureOfOwnership>",
" <sharesOwnedFollowingTransaction>", " <transactionAcquiredDisposedCode>",
" <transactionCode>S</transactionCode>", " <transactionFormType>4</transactionFormType>",
" <transactionPricePerShare>", " <transactionShares>",
" <value>2020-04-06</value>", " <value>Common Stock</value>",
" </ownershipNature>", " </postTransactionAmounts>",
" </securityTitle>", " </transactionAmounts>",
" </transactionCoding>", " </transactionDate>",
" <isDirector>0</isDirector>", " <isOfficer>0</isOfficer>",
" <isOther>0</isOther>", " <isTenPercentOwner>1</isTenPercentOwner>",
" <officerTitle></officerTitle>", " <otherText></otherText>",
" <ownershipNature>", " <postTransactionAmounts>",
" <rptOwnerCik>0001218981</rptOwnerCik>", " <rptOwnerCity>CHICAGO</rptOwnerCity>",
" <rptOwnerName>STAFFORD JOHN S III</rptOwnerName>",
" <rptOwnerState>IL</rptOwnerState>", " <rptOwnerStateDescription></rptOwnerStateDescription>",
" <rptOwnerStreet1>350 N. ORLEANS STREET</rptOwnerStreet1>",
" <rptOwnerStreet2>SUITE 2N</rptOwnerStreet2>", " <rptOwnerZipCode>60654-1975</rptOwnerZipCode>",
" <securityTitle>", " <transactionAmounts>",
" <transactionCoding>", " <transactionDate>",
" </nonDerivativeTransaction>", " </reportingOwnerAddress>",
" </reportingOwnerId>", " </reportingOwnerRelationship>",
" <footnote id=F1>This transaction was executed in multiple trades at prices ranging from $31.48 to $32.30. The price reported above reflects the weighted average purchase price. The reporting person hereby undertakes to provide upon request to the SEC staff, the issuer or a security holder of the issuer full information regarding the number of shares and prices at which the transactions were effected.</footnote>",
" <footnote id=F2>The transaction was executed in a single, privately negotiated transaction with an institutional buyer.</footnote>",
" <issuerCik>0001326732</issuerCik>", " <issuerName>Xencor Inc</issuerName>",
" <issuerTradingSymbol>XNCR</issuerTradingSymbol>", " <nonDerivativeTransaction>",
" <reportingOwnerAddress>", " <reportingOwnerId>",
" <reportingOwnerRelationship>", " <signatureDate>2020-04-08</signatureDate>",
" <signatureName>/s/ John S. Stafford, III</signatureName>",
" </footnotes>", " </issuer>", " </nonDerivativeTable>",
" </ownerSignature>", " </reportingOwner>", " <derivativeTable></derivativeTable>",
" <documentType>4</documentType>", " <footnotes>", " <issuer>",
" <nonDerivativeTable>", " <notSubjectToSection16>1</notSubjectToSection16>",
" <ownerSignature>", " <periodOfReport>2020-04-06</periodOfReport>",
" <reportingOwner>", " <schemaVersion>X0306</schemaVersion>",
"\t\tBUSINESS PHONE:\t\t626-305-5900", "\t\tCENTRAL INDEX KEY:\t\t\t0001218981",
"\t\tCENTRAL INDEX KEY:\t\t\t0001326732", "\t\tCITY:\t\t\tCHICAGO",
"\t\tCITY:\t\t\tMONROVIA", "\t\tCOMPANY CONFORMED NAME:\t\t\tSTAFFORD JOHN S III",
"\t\tCOMPANY CONFORMED NAME:\t\t\tXencor Inc", "\t\tFILM NUMBER:\t\t20782220",
"\t\tFISCAL YEAR END:\t\t\t1231", "\t\tFORM TYPE:\t\t4", "\t\tIRS NUMBER:\t\t\t\t201622502",
"\t\tSEC ACT:\t\t1934 Act", "\t\tSEC FILE NUMBER:\t001-36182",
"\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tPHARMACEUTICAL PREPARATIONS [2834]",
"\t\tSTATE OF INCORPORATION:\t\t\tDE", "\t\tSTATE:\t\t\tCA",
"\t\tSTATE:\t\t\tIL", "\t\tSTREET 1:\t\t111 WEST LEMON AVE",
"\t\tSTREET 1:\t\t230 SOUTH LASALLE STREET 400", "\t\tZIP:\t\t\t60604",
"\t\tZIP:\t\t\t91016", "\tBUSINESS ADDRESS:\t", "\tCOMPANY DATA:\t",
"\tFILING VALUES:", "\tMAIL ADDRESS:\t", "\tOWNER DATA:\t", "</DOCUMENT>",
"</ownershipDocument>", "</SEC-DOCUMENT>", "</SEC-HEADER>", "</TEXT>",
"</XML>", "<?xml version=1.0?>", "<ACCEPTANCE-DATETIME>20200408162604",
"<DESCRIPTION>FORM 4 -", "<DOCUMENT>", "<FILENAME>edgar.xml",
"<ownershipDocument>", "<SEC-DOCUMENT>0001179110-20-004802.txt : 20200408",
"<SEC-HEADER>0001179110-20-004802.hdr.sgml : 20200408", "<SEQUENCE>1",
"<TEXT>", "<TYPE>4", "<XML>", "ACCESSION NUMBER:\t\t0001179110-20-004802",
"CONFORMED PERIOD OF REPORT:\t20200406", "CONFORMED SUBMISSION TYPE:\t4",
"DATE AS OF CHANGE:\t\t20200408", "FILED AS OF DATE:\t\t20200408",
"ISSUER:\t\t", "PUBLIC DOCUMENT COUNT:\t\t1", "REPORTING-OWNER:\t"
), class = "factor")), class = "data.frame", row.names = c(NA,
-176L))
Помощь приветствуется! Спасибо!