Извлечение текста и чисел с помощью регулярных выражений из результатов веб-скрейпинга для сборки в новый чистый фрейм данных - PullRequest
0 голосов
/ 27 апреля 2020

Я пытаюсь собрать и упорядочить определённые поля данных, полученных с помощью веб-скрейпинга в R. Я загрузил всю страницу в R как фрейм данных, но хочу извлечь только числовые значения, расположенные между совпадениями регулярных выражений (если такие совпадения есть), и затем собрать их в новый фрейм данных.

Поля, которые я хотел бы собирать исключительно, представляют собой числовые значения, встречающиеся между следующим образцом шаблона регулярных выражений:

<transactionPricePerShare>
    <value>27.10</value>
</transactionPricePerShare>

В некоторых случаях таких экземпляров может быть несколько; тогда я хотел бы объединить эти значения в один фрейм данных. Если же экземпляр всего один, в фрейме данных должна остаться одна строка.

Вот мой пример сценария и dput структуры образца данных:

# Extract every transaction price from a scraped SEC Form 4 document.
#
# Input : `df`, a one-column data frame holding the document line by line.
# Output: `form4_4`, a one-column data frame ("Transacted Price ($)") with one
#         row per <transactionPricePerShare> element (zero rows if none).

# Flatten the data frame into a character vector of lines and strip the
# indentation so tags can be matched from the start of each line.
form4_2 <- as.character(unlist(df, use.names = FALSE))
form4_2 <- str_trim(form4_2)

# Line positions of the opening and closing price tags. str_detect() already
# returns a logical vector, so it is passed to which() directly; fixed()
# matches the tag names literally instead of as regular expressions.
price_open  <- which(str_detect(form4_2, fixed("<transactionPricePerShare>")))
price_close <- which(str_detect(form4_2, fixed("</transactionPricePerShare>")))

# Unbalanced tags would make the pairing below silently recycle indices.
stopifnot(length(price_open) == length(price_close))

# Collect Transaction Price
# The line directly before the closing tag is not necessarily the <value>
# line: a <footnoteId .../> line can sit between them. Search each
# opening/closing pair for its <value> line instead of assuming a fixed offset.
form4_3 <- unlist(Map(function(from, to) {
  block <- form4_2[from:to]
  found <- str_match(block, "^<value>(.*)</value>$")[, 2]
  found <- found[!is.na(found)]
  # Keep one entry per transaction; NA marks a block without a <value> line.
  if (length(found) == 0) NA_character_ else found[1]
}, price_open, price_close))

form4_4 <- data.frame(as.numeric(form4_3))
colnames(form4_4) <- "Transacted Price ($)"

структура df:

structure(list(V1 = structure(c(124L, 125L, 119L, 130L, 132L, 
136L, 131L, 134L, 133L, 137L, 111L, 91L, 87L, 109L, 95L, 97L, 
98L, 93L, 110L, 104L, 89L, 102L, 105L, 135L, 108L, 92L, 88L, 
99L, 96L, 100L, 94L, 107L, 103L, 90L, 101L, 106L, 86L, 110L, 
103L, 90L, 101L, 106L, 115L, 121L, 128L, 126L, 122L, 120L, 127L, 
129L, 118L, 123L, 85L, 77L, 83L, 81L, 79L, 62L, 63L, 64L, 72L, 
84L, 67L, 44L, 46L, 58L, 66L, 49L, 50L, 45L, 47L, 51L, 48L, 57L, 
68L, 36L, 37L, 39L, 38L, 40L, 41L, 59L, 75L, 80L, 65L, 52L, 29L, 
32L, 55L, 28L, 35L, 54L, 25L, 24L, 19L, 34L, 53L, 27L, 6L, 17L, 
26L, 5L, 1L, 16L, 23L, 10L, 15L, 33L, 43L, 22L, 3L, 14L, 31L, 
42L, 18L, 11L, 12L, 21L, 9L, 13L, 30L, 56L, 65L, 52L, 29L, 32L, 
55L, 28L, 35L, 54L, 25L, 24L, 19L, 20L, 34L, 53L, 27L, 7L, 17L, 
26L, 4L, 2L, 16L, 23L, 10L, 15L, 33L, 43L, 22L, 8L, 14L, 31L, 
42L, 18L, 11L, 12L, 21L, 9L, 13L, 30L, 56L, 73L, 76L, 78L, 60L, 
61L, 71L, 82L, 70L, 69L, 74L, 113L, 117L, 116L, 112L, 114L), .Label = c("                    <footnoteId id=F1/>", 
"                    <footnoteId id=F2/>", "                    <value>1206005</value>", 
"                    <value>27.10</value>", "                    <value>31.84</value>", 
"                    <value>44771</value>", "                    <value>600000</value>", 
"                    <value>606005</value>", "                    <value>By Ronin Trading, LLC</value>", 
"                    <value>D</value>", "                    <value>I</value>", 
"                </directOrIndirectOwnership>", "                </natureOfOwnership>", 
"                </sharesOwnedFollowingTransaction>", "                </transactionAcquiredDisposedCode>", 
"                </transactionPricePerShare>", "                </transactionShares>", 
"                <directOrIndirectOwnership>", "                <equitySwapInvolved>0</equitySwapInvolved>", 
"                <footnoteId id=F2/>", "                <natureOfOwnership>", 
"                <sharesOwnedFollowingTransaction>", "                <transactionAcquiredDisposedCode>", 
"                <transactionCode>S</transactionCode>", "                <transactionFormType>4</transactionFormType>", 
"                <transactionPricePerShare>", "                <transactionShares>", 
"                <value>2020-04-06</value>", "                <value>Common Stock</value>", 
"            </ownershipNature>", "            </postTransactionAmounts>", 
"            </securityTitle>", "            </transactionAmounts>", 
"            </transactionCoding>", "            </transactionDate>", 
"            <isDirector>0</isDirector>", "            <isOfficer>0</isOfficer>", 
"            <isOther>0</isOther>", "            <isTenPercentOwner>1</isTenPercentOwner>", 
"            <officerTitle></officerTitle>", "            <otherText></otherText>", 
"            <ownershipNature>", "            <postTransactionAmounts>", 
"            <rptOwnerCik>0001218981</rptOwnerCik>", "            <rptOwnerCity>CHICAGO</rptOwnerCity>", 
"            <rptOwnerName>STAFFORD JOHN S III</rptOwnerName>", 
"            <rptOwnerState>IL</rptOwnerState>", "            <rptOwnerStateDescription></rptOwnerStateDescription>", 
"            <rptOwnerStreet1>350 N. ORLEANS STREET</rptOwnerStreet1>", 
"            <rptOwnerStreet2>SUITE 2N</rptOwnerStreet2>", "            <rptOwnerZipCode>60654-1975</rptOwnerZipCode>", 
"            <securityTitle>", "            <transactionAmounts>", 
"            <transactionCoding>", "            <transactionDate>", 
"        </nonDerivativeTransaction>", "        </reportingOwnerAddress>", 
"        </reportingOwnerId>", "        </reportingOwnerRelationship>", 
"        <footnote id=F1>This transaction was executed in multiple trades at prices ranging from $31.48 to $32.30. The price reported above reflects the weighted average purchase price. The reporting person hereby undertakes to provide upon request to the SEC staff, the issuer or a security holder of the issuer full information regarding the number of shares and prices at which the transactions were effected.</footnote>", 
"        <footnote id=F2>The transaction was executed in a single, privately negotiated transaction with an institutional buyer.</footnote>", 
"        <issuerCik>0001326732</issuerCik>", "        <issuerName>Xencor Inc</issuerName>", 
"        <issuerTradingSymbol>XNCR</issuerTradingSymbol>", "        <nonDerivativeTransaction>", 
"        <reportingOwnerAddress>", "        <reportingOwnerId>", 
"        <reportingOwnerRelationship>", "        <signatureDate>2020-04-08</signatureDate>", 
"        <signatureName>/s/ John S. Stafford, III</signatureName>", 
"    </footnotes>", "    </issuer>", "    </nonDerivativeTable>", 
"    </ownerSignature>", "    </reportingOwner>", "    <derivativeTable></derivativeTable>", 
"    <documentType>4</documentType>", "    <footnotes>", "    <issuer>", 
"    <nonDerivativeTable>", "    <notSubjectToSection16>1</notSubjectToSection16>", 
"    <ownerSignature>", "    <periodOfReport>2020-04-06</periodOfReport>", 
"    <reportingOwner>", "    <schemaVersion>X0306</schemaVersion>", 
"\t\tBUSINESS PHONE:\t\t626-305-5900", "\t\tCENTRAL INDEX KEY:\t\t\t0001218981", 
"\t\tCENTRAL INDEX KEY:\t\t\t0001326732", "\t\tCITY:\t\t\tCHICAGO", 
"\t\tCITY:\t\t\tMONROVIA", "\t\tCOMPANY CONFORMED NAME:\t\t\tSTAFFORD JOHN S III", 
"\t\tCOMPANY CONFORMED NAME:\t\t\tXencor Inc", "\t\tFILM NUMBER:\t\t20782220", 
"\t\tFISCAL YEAR END:\t\t\t1231", "\t\tFORM TYPE:\t\t4", "\t\tIRS NUMBER:\t\t\t\t201622502", 
"\t\tSEC ACT:\t\t1934 Act", "\t\tSEC FILE NUMBER:\t001-36182", 
"\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tPHARMACEUTICAL PREPARATIONS [2834]", 
"\t\tSTATE OF INCORPORATION:\t\t\tDE", "\t\tSTATE:\t\t\tCA", 
"\t\tSTATE:\t\t\tIL", "\t\tSTREET 1:\t\t111 WEST LEMON AVE", 
"\t\tSTREET 1:\t\t230 SOUTH LASALLE STREET 400", "\t\tZIP:\t\t\t60604", 
"\t\tZIP:\t\t\t91016", "\tBUSINESS ADDRESS:\t", "\tCOMPANY DATA:\t", 
"\tFILING VALUES:", "\tMAIL ADDRESS:\t", "\tOWNER DATA:\t", "</DOCUMENT>", 
"</ownershipDocument>", "</SEC-DOCUMENT>", "</SEC-HEADER>", "</TEXT>", 
"</XML>", "<?xml version=1.0?>", "<ACCEPTANCE-DATETIME>20200408162604", 
"<DESCRIPTION>FORM 4 -", "<DOCUMENT>", "<FILENAME>edgar.xml", 
"<ownershipDocument>", "<SEC-DOCUMENT>0001179110-20-004802.txt : 20200408", 
"<SEC-HEADER>0001179110-20-004802.hdr.sgml : 20200408", "<SEQUENCE>1", 
"<TEXT>", "<TYPE>4", "<XML>", "ACCESSION NUMBER:\t\t0001179110-20-004802", 
"CONFORMED PERIOD OF REPORT:\t20200406", "CONFORMED SUBMISSION TYPE:\t4", 
"DATE AS OF CHANGE:\t\t20200408", "FILED AS OF DATE:\t\t20200408", 
"ISSUER:\t\t", "PUBLIC DOCUMENT COUNT:\t\t1", "REPORTING-OWNER:\t"
), class = "factor")), class = "data.frame", row.names = c(NA, 
-176L))

Помощь приветствуется! Спасибо!

...