Попытка написать Python Scrapper, который записывает данные с веб-страницы в CSV-файл
Попытался изменить способ, которым я пишу файл Python, также, если я удалю строки dataFrameCleaned = cleanDataUp (dataFrame)
csvData (dataFrameCleaned)
код выполняется, однако он не записывает данные в файл CSV
'''
write data to csv
'''
def csvData(dataFrame):
with open('threads.csv', 'w+', newline='', encoding='utf8') as csvfile:
fieldnames = ['post id', 'name', 'date of the post', 'post body']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for posts in dataFrame:
writer.writerow(posts)
print('file write complete')
'''
defaults
'''
if __name__ == "__main__":
path = str(os.path.dirname(os.path.realpath(__file__)))+'/data/'
reload(sys)
fieldnames = ['post id', 'name', 'date of the post', 'post body']
dataFrame = pd.DataFrame(columns=fieldnames)
url = 'http://www.oldclassiccar.co.uk/forum/phpbb/phpBB2/viewtopic.php?t=12591'
urlList = [url]
soup = get_soup(url)
while True:
newUrlSuffix = getURL(soup)
if newUrlSuffix == '':
break
newUrl = 'http://www.oldclassiccar.co.uk/forum/phpbb/phpBB2/' + newUrlSuffix
print("Adding new URL to list..")
urlList.append(newUrl)
soup = get_soup(newUrl)
for link in urlList:
print("Getting data from URL:" + link+ '\n\n\n')
dataFrameNew = extractData(link)
dataFrame = pd.concat([dataFrame,dataFrameNew])
dataFrameCleaned = cleanDataUp(dataFrame)
csvData(dataFrameCleaned)
The function for cleanDataUp
def cleanDataUp(dataFrame):
dataFrame = dataFrame.reset_index(drop=True).dropna()
return dataFrame
Traceback (most recent call last):
File "scraper.py", line 127, in <module>
csvData(dataFrameCleaned)
File "scraper.py", line 96, in csvData
writer.writerows(posts)
File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/csv.py", line 158, in writerows
return self.writer.writerows(map(self._dict_to_list, rowdicts))
File "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/csv.py", line 148, in _dict_to_list
wrong_fields = rowdict.keys() - self.fieldnames
AttributeError: 'str' object has no attribute 'keys'