Попробуйте получить список уникальных идентификаторов, общих для обоих фреймов данных, с помощью `merge`, затем отфильтруйте каждый фрейм по этому списку с помощью `isin` или `query` и объедините два полученных подмножества:
# GROUPID values present in both frames: inner merge on the key, then de-duplicate.
_common_rows = pd.merge(emails, attachments, on='GROUPID')
shared_ids = _common_rows['GROUPID'].unique().tolist()

# Filter each frame down to the shared IDs. The two filters are equivalent:
# a boolean mask built with isin() and a query() referencing the list via @.
_matched_emails = emails[emails['GROUPID'].isin(shared_ids)]
_matched_attachments = attachments.query("GROUPID == @shared_ids")

# Stack the two subsets (column order as encountered) and renumber the rows.
email_attachments = pd.concat(
    [_matched_emails, _matched_attachments],
    sort=False,
).reset_index(drop=True)
Для демонстрации на примере данных:
# Sample parent e-mails: each e-mail is its own group (DOCID == GROUPID).
emails = pd.DataFrame({
    'Parent': [True, True],
    'DOCID': ['DOC008', 'DOC023'],
    'GROUPID': ['DOC008', 'DOC023'],
    'FileName': ['epe model for 01/25/02', 'PLEASE READ'],
    'FileType': ['Microsoft Outlook Note', 'Microsoft Outlook Note'],
})

# Sample attachments: only two of the four GROUPID values match an e-mail above.
attachments = pd.DataFrame({
    'EmailAttachment': ['Attachment'] * 4,
    'DOCID': ['DOC008', 'DOC012', 'DOC024', 'DOC029'],
    'GROUPID': ['DOC008', 'DOC011', 'DOC023', 'DOC028'],
    'FileName': [
        '1-25act.xls', '1-29act.xls',
        'Certification.doc', 'Certification.doc',
    ],
    'FileType': [
        'Microsoft Excel Spreadsheet', 'Microsoft Excel Spreadsheet',
        'Microsoft Word Document', 'Microsoft Word Document',
    ],
})

# GROUPID values present in both frames: inner merge on the key, then de-duplicate.
_common_rows = pd.merge(emails, attachments, on='GROUPID')
shared_ids = _common_rows['GROUPID'].unique().tolist()

# Two equivalent ways to filter by the shared IDs: isin() mask and query() with @.
_matched_emails = emails[emails['GROUPID'].isin(shared_ids)]
_matched_attachments = attachments.query("GROUPID == @shared_ids")

# Stack both subsets and renumber the rows from zero.
email_attachments = pd.concat(
    [_matched_emails, _matched_attachments],
    sort=False,
).reset_index(drop=True)

email_attachments
#   Parent   DOCID GROUPID                FileName                     FileType EmailAttachment
# 0   True  DOC008  DOC008  epe model for 01/25/02       Microsoft Outlook Note             NaN
# 1   True  DOC023  DOC023             PLEASE READ       Microsoft Outlook Note             NaN
# 2    NaN  DOC008  DOC008             1-25act.xls  Microsoft Excel Spreadsheet      Attachment
# 3    NaN  DOC024  DOC023       Certification.doc      Microsoft Word Document      Attachment