Я пытаюсь проанализировать массив патентных данных USPTO (bulk data). Обычно эти данные доступны здесь, но в настоящее время сайт не работает: https://ped.uspto.gov/peds/
Вот первая запись из JSON-файла:
{
"PatentBulkData":[
{
"patentCaseMetadata":{
"applicationNumberText":{
"value":"15733015",
"electronicText":"15733015"
},
"filingDate":"2020-01-01",
"applicationTypeCategory":"Utility",
"partyBag":{
"applicantBagOrInventorBagOrOwnerBag":[
{
"applicant":[
{
"contactOrPublicationContact":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"personStructuredName":{
"firstName":"Birol",
"middleName":"",
"lastName":"Cimen"
}
}
]
},
"cityName":"Hengelo",
"geographicRegionName":{
"value":"",
"geographicRegionCategory":"STATE"
},
"countryCode":"NL"
}
]
}
]
},
{
"partyIdentifierOrContact":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"personStructuredName":{
"lastName":"Oppedahl Patent Law Firm LLC (Mink)"
}
}
]
},
"postalAddressBag":{
"postalAddress":[
{
"postalStructuredAddress":{
"addressLineText":[
{
"value":"P O Box 351240"
}
],
"cityName":"Westminster",
"geographicRegionName":[
{
"value":"CO"
}
],
"countryCode":"US",
"postalCode":"80035"
}
}
]
}
},
{
"value":"133517"
}
]
}
]
},
"groupArtUnitNumber":{
"value":"3771",
"electronicText":"3771"
},
"applicationConfirmationNumber":"7897",
"applicantFileReference":"FP01.P035 SST02US",
"priorityClaimBag":{
"priorityClaim":[
{
"ipOfficeName":"NETHERLANDS",
"applicationNumber":{
"applicationNumberText":"2019179"
},
"filingDate":"2017-07-05",
"sequenceNumber":"1"
}
]
},
"patentClassificationBag":{
"cpcClassificationBagOrIPCClassificationOrECLAClassificationBag":[
{
"ipOfficeCode":"US",
"mainNationalClassification":{
"nationalClass":"606",
"nationalSubclass":"133000"
}
}
]
},
"businessEntityStatusCategory":"SMALL",
"firstInventorToFileIndicator":"true",
"inventionTitle":{
"content":[
"Hair removal device for removing body hair on a body surface"
]
},
"applicationStatusCategory":"Application Dispatched from Preexam, Not Yet Docketed",
"applicationStatusDate":"2020-05-08",
"officialFileLocationCategory":"ELECTRONIC",
"patentPublicationIdentification":{
"publicationNumber":"US20200170371A1",
"publicationDate":"2020-06-04"
},
"relatedDocumentData":{
"parentDocumentDataOrChildDocumentData":[
{
"descriptionText":"This application is National Stage Entry of",
"applicationNumberText":"PCT/NL2018/050434",
"filingDate":"2018-07-04",
"parentDocumentStatusCode":"Published",
"patentNumber":""
}
]
}
},
"prosecutionHistoryDataBag":{
"prosecutionHistoryData":[
{
"eventDate":"2020-06-05",
"eventCode":"PG-ISSUE",
"eventDescriptionText":"PG-Pub Issue Notification"
},
{
"eventDate":"2020-05-11",
"eventCode":"M903",
"eventDescriptionText":"Notice of DO/EO Acceptance Mailed"
},
{
"eventDate":"2020-05-11",
"eventCode":"FLRCPT.U",
"eventDescriptionText":"Filing Receipt - Updated"
},
{
"eventDate":"2020-05-11",
"eventCode":"MPEN",
"eventDescriptionText":"Mail Pre-Exam Notice"
},
{
"eventDate":"2020-02-26",
"eventCode":"EML_NTR",
"eventDescriptionText":"Email Notification"
},
{
"eventDate":"2020-02-26",
"eventCode":"EML_NTR",
"eventDescriptionText":"Email Notification"
},
{
"eventDate":"2020-02-26",
"eventCode":"CCRDY",
"eventDescriptionText":"Application ready for PDX access by participating foreign offices"
},
{
"eventDate":"2020-01-05",
"eventCode":"371COMP",
"eventDescriptionText":"371 Completion Date"
},
{
"eventDate":"2020-02-25",
"eventCode":"PGPC",
"eventDescriptionText":"Sent to Classification Contractor"
},
{
"eventDate":"2020-02-25",
"eventCode":"FTFS",
"eventDescriptionText":"FITF set to YES - revise initial setting"
},
{
"eventDate":"2020-01-02",
"eventCode":"PTA.RFE",
"eventDescriptionText":"Patent Term Adjustment - Ready for Examination"
},
{
"eventDate":"2020-02-26",
"eventCode":"FLRCPT.O",
"eventDescriptionText":"Filing Receipt"
},
{
"eventDate":"2020-02-26",
"eventCode":"M903",
"eventDescriptionText":"Notice of DO/EO Acceptance Mailed"
},
{
"eventDate":"2019-12-31",
"eventCode":"SREXR141",
"eventDescriptionText":"PTO/SB/69-Authorize EPO Access to Search Results"
},
{
"eventDate":"2019-12-31",
"eventCode":"APPERMS",
"eventDescriptionText":"Applicants have given acceptable permission for participating foreign "
},
{
"eventDate":"2020-02-25",
"eventCode":"SMAL",
"eventDescriptionText":"Applicant Has Filed a Verified Statement of Small Entity Status in Compliance with 37 CFR 1.27"
},
{
"eventDate":"2019-12-31",
"eventCode":"L194",
"eventDescriptionText":"Cleared by OIPE CSR"
},
{
"eventDate":"2019-12-31",
"eventCode":"WIDS",
"eventDescriptionText":"Information Disclosure Statement (IDS) Filed"
},
{
"eventDate":"2019-12-31",
"eventCode":"WIDS",
"eventDescriptionText":"Information Disclosure Statement (IDS) Filed"
},
{
"eventDate":"2019-12-31",
"eventCode":"BIG.",
"eventDescriptionText":"ENTITY STATUS SET TO UNDISCOUNTED (INITIAL DEFAULT SETTING OR STATUS CHANGE)"
},
{
"eventDate":"2019-12-31",
"eventCode":"IEXX",
"eventDescriptionText":"Initial Exam Team nn"
}
]
},
"st96Version":"V3_1",
"ipoVersion":"US_V8_0"
},
Я импортирую данные JSON как словарь. Как лучше всего извлечь из него нужную мне информацию? Стоит ли использовать json_normalize (pandas), чтобы развернуть вложенную структуру и преобразовать её в DataFrame?
В частности, я хотел бы получить информацию из «prosecutionHistoryData». Например, для других патентных заявок она покажет, сколько действий ведомства (office actions) было выдано.
В конечном итоге я хотел бы сопоставить эти данные о действиях ведомства с патентным экспертом (его можно найти в «applicantBagOrInventorBagOrOwnerBag», если заявке уже назначен эксперт).
Есть ли какие-нибудь хорошие ресурсы, объясняющие, как очищать JSON-данные и разбивать полученную информацию на отдельные столбцы?
Спасибо за информацию! Вот пример записи, в которой указан эксперт (Examiner):
{
"patentCaseMetadata":{
"applicationNumberText":{
"value":"16732312",
"electronicText":"16732312"
},
"filingDate":"2020-01-01",
"applicationTypeCategory":"Utility",
"partyBag":{
"applicantBagOrInventorBagOrOwnerBag":[
{
"primaryExaminerOrAssistantExaminerOrAuthorizedOfficer":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"personFullName":"ORGAD, EDAN"
}
]
}
}
]
},
{
"applicant":[
{
"contactOrPublicationContact":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"organizationStandardName":{
"content":[
"Communication Systems LLC"
]
}
}
]
},
"cityName":"Santa Fe",
"geographicRegionName":{
"value":"NM",
"geographicRegionCategory":"STATE"
},
"countryCode":""
}
]
}
]
}
]
},
"groupArtUnitNumber":{
"value":"2414",
"electronicText":"2414"
},
"applicationConfirmationNumber":"8996",
"applicantFileReference":"CS1003US03",
"patentClassificationBag":{
"cpcClassificationBagOrIPCClassificationOrECLAClassificationBag":[
{
"ipOfficeCode":"US",
"mainNationalClassification":{
"nationalClass":"370",
"nationalSubclass":"329000"
}
}
]
},
"businessEntityStatusCategory":"SMALL",
"firstInventorToFileIndicator":"true",
"inventionTitle":{
"content":[
"APPARATUSES, METHODS, AND COMPUTER-READABLE MEDIUM FOR COMMUNICATION IN A WIRELESS LOCAL AREA NETWORK"
]
},
"applicationStatusCategory":"Docketed New Case - Ready for Examination",
"applicationStatusDate":"2020-02-07",
"officialFileLocationCategory":"ELECTRONIC",
"patentPublicationIdentification":{
"publicationNumber":"US20200154403A1",
"publicationDate":"2020-05-14"
}
},
"prosecutionHistoryDataBag":{
"prosecutionHistoryData":[
{
"eventDate":"2020-05-19",
"eventCode":"PG-ISSUE",
"eventDescriptionText":"PG-Pub Issue Notification"
}
]
},
"assignmentDataBag":{
"assignmentData":[
{
"reelNumber":"52436",
"frameNumber":"295",
"documentReceivedDate":"2020-04-20",
"recordedDate":"2020-04-20",
"mailDate":"2020-04-21",
"pageTotalQuantity":3,
"conveyanceText":"ASSIGNMENT OF ASSIGNORS INTEREST (SEE DOCUMENT FOR DETAILS).",
"assignorBag":{
"assignor":[
{
"executionDate":"2016-07-14",
"contactOrPublicationContact":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"value":"ATEFI, ALI"
}
]
}
}
]
}
]
},
"assigneeBag":{
"assignee":[
{
"contactOrPublicationContact":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"value":"COMMUNICATION SYSTEMS LLC"
}
]
},
"postalAddressBag":{
"postalAddress":[
{
"postalAddressText":[
{
"sequenceNumber":"1",
"value":"530-B HARKLE ROAD"
},
{
"sequenceNumber":"2",
"value":"STE. 100"
},
{
"sequenceNumber":"3",
"value":"SANTA FE NEW MEXICO 87505"
}
]
}
]
}
}
]
}
]
},
"correspondenceAddress":{
"partyIdentifierOrContact":[
{
"name":{
"personNameOrOrganizationNameOrEntityName":[
{
"value":"ALI ATEFI"
}
]
},
"postalAddressBag":{
"postalAddress":[
{
"postalAddressText":[
{
"sequenceNumber":"1",
"value":"530-B HARKLE ROAD"
},
{
"sequenceNumber":"2",
"value":"STE. 100"
},
{
"sequenceNumber":"3",
"value":"SANTA FE, NM 87505"
}
]
}
]
}
}
]
},
"sequenceNumber":"1"
}
],
"assignmentTotalQuantity":1
},
"st96Version":"V3_1",
"ipoVersion":"US_V8_0"
},
Мой разбор не проходит дальше объекта applicantBagOrInventorBagOrOwnerBag. Вот пример выражения, которым я пытаюсь получить имя эксперта (Examiner); оно возвращает пустой фрейм данных:
jsonpath_expression = parse('PatentBulkData[*].patentCaseMetadata.partyBag.applicantBagOrInventorBagOrOwnerBag.primaryExaminerOrAssistantExaminerOrAuthorizedOfficer.name.personNameOrOrganizationNameOrEntityName.personFullName[*]')
Если я заканчиваю выражение на поле applicantBagOrInventorBagOrOwnerBag, то получаю фрейм данных с нужной информацией, но вместе со скобками и всей остальной JSON-нотацией. Возможно, я неправильно указываю структуру ключей?
Еще раз спасибо!