У меня большой файл, более 89 миллионов строк. Я хочу прочитать файл, загрузить его в хеш-таблицу и затем выполнить некоторые вычисления.
Проблема в том, что при использовании istream чтение файла и загрузка данных в хеш-таблицу происходят слишком медленно.
Можно ли читать файл в несколько потоков выполнения, например с помощью библиотеки потоков (thread)?
Или мне нужно разбить файл на небольшие части и затем обрабатывать каждую часть в отдельном потоке?
Хеш-функция не требует много времени для вычисления.
Для разрешения коллизий я использую списки. Размер таблицы составляет 1 млн.
// Adding_date_too_file.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include <iostream>
#include <fstream>
using namespace std;
int main()
{
hasho Hashy;
string f1, f2, f3, f4, f5, f6, f7;
bool is_first_line = true;
fstream file_input;
fstream file_2_Collums;
cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only two column\n which is going to be used for searching based on that file)" << flush;
while (true)
{
string infilename;
getline(cin, infilename);
file_input.open(infilename.c_str());
if (file_input)break;
cout << "Invalid file. Please enter a valid input file name> " << flush;
}
cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only one column )" << flush;
while (true)
{
string infilename;
getline(cin, infilename);
file_2_Collums.open(infilename.c_str());
if (file_2_Collums)break;
cout << "Invalid file. Please enter a valid input file name> " << flush;
}
//creating output file
int * table;
table = new int[2];
int count_file_lines = 0;
int line_counter_inventors = 0;
if (file_input.is_open())
{
while (!file_input.eof())
{
if (is_first_line == true) {
getline(file_input, f1, '\n');
is_first_line = false;
}
getline(file_input, f1, '\t');// patent id
getline(file_input, f2, '\t');// patent id
getline(file_input, f3, '\t');// patent id
getline(file_input, f3, '\t');// patent id
getline(file_input, f6, '\t');// patent id
getline(file_input, f3, '\n');//date
//cout << "adding these items " << f1 << '\t' << f6 << endl;
Hashy.AddItem(f2, f6);
cout << count_file_lines << endl;
count_file_lines++;
// cout << f2 << '\t' << f6 << endl;
}
}
int lines_2 = 0;
if (file_2_Collums.is_open())
{
Hashy.openOutputFile();
while (!file_2_Collums.eof())
{
getline(file_2_Collums, f1, '\t');//patent_id
getline(file_2_Collums, f4, '\n');//assignee_id
//cout << f1 << endl;
Hashy.FindDateId(f1, f4);
lines_2++;
}
}
system("pause");
return 0;}
Hash.cpp
#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include "hash.h"
#include <fstream>
using namespace std;
// Output stream shared by the hasho methods in this file: openOutputFile()
// opens it and FindDateId() writes the matched records to it.
// `static` gives it internal linkage, so it is private to this translation unit.
static ofstream output_file;
// Builds the table: every bucket gets its own heap-allocated head node whose
// pattent_id and date are the placeholder string "empty" and whose chain is
// terminated (next == NULL).
hasho::hasho()
{
    for (int bucket = 0; bucket != tableSize; ++bucket) {
        item* head = new item;
        head->pattent_id = "empty";
        head->date = "empty";
        head->next = NULL;
        HashTable[bucket] = head;
    }
}
// Prompts on stdout for an output file name and opens the file-local
// output_file stream for writing.
//
// The prompt is repeated until a file could actually be opened; previously a
// name that could not be opened was accepted silently and every later write
// was lost. If standard input ends, the function returns with the stream
// left unopened.
void hasho::openOutputFile() {
    cout << "Please give the name of the output file: \n(The file should end with the format type (txt,csv etc.)) " << flush;
    string outfilename;
    while (getline(cin, outfilename))
    {
        output_file.open(outfilename.c_str(), fstream::out);
        if (output_file) {
            return;
        }
        // A failed open() leaves failbit set; reset it before the next attempt.
        output_file.clear();
        cout << "Invalid file. Please enter a valid output file name> " << flush;
    }
}
int hasho::NumberOfItemsInIndex(int index) {
int count = 0;
if (HashTable[index]->date == "empty") {
return count;
}
else {
count++;
item* ptr = HashTable[index];
while (ptr->next != NULL) {
count++;
ptr = ptr->next;
}
}
return count;
}
void hasho::PrintTable() {
int number;
for (int i = 0; i < tableSize; i++) {
number = NumberOfItemsInIndex(i);
cout << "---------------------------------------\n";
cout << "index= " << i << endl;
cout << HashTable[i]->pattent_id << endl;
cout << HashTable[i]->date << endl;
cout << "# of items= " << number << endl;
cout << "---------------------------------------\n";
}
}
void hasho::PrintItemsInIndex(int index) {
item* ptr = HashTable[index];
if (ptr->date == "empty") {
cout << "index = " << index << " is empty." << endl;
}
else {
cout << "index = " << index << " contains the following items\n";
while (ptr != NULL) {
cout << "-----------" << endl;
cout << ptr->date << endl;
cout << ptr->pattent_id << endl;
cout << "-----------" << endl;
ptr = ptr->next;
}
}
}
void hasho::AddItem(string pattend_id, string date)
{
int index = Hash(pattend_id);
if (HashTable[index]->pattent_id == "empty")
{
HashTable[index]->pattent_id = pattend_id;
HashTable[index]->date = date;
}
else {
item* ptr = HashTable[index];
item* n = new item;
n->date = date ;
n->pattent_id = pattend_id;
n->next = NULL;
while (ptr->next != NULL) {
ptr = ptr->next;
}
ptr->next = n;
}
}
void hasho::FindDateId(string pattend_id, string assignee_id1) {
int found = 0;
int nfound = 0;
int index = Hash(pattend_id);
bool foundDateId = false;
string date;
item* ptr = HashTable[index];
int count = 1;
//write to file
//cout << "WE are looking for the date of " <<pattend_id << " in Index: " << index <<endl;
while (ptr != NULL) {
//cout << "NOw we are looking with : " << pattend_id << endl;
if (ptr->pattent_id == pattend_id) {
//cout << "NOw we are looking with : " << pattend_id <<endl;
foundDateId = true;
date = ptr->date;
//write to file
output_file << pattend_id << "\t";
output_file << assignee_id1 << endl;
output_file << date << "\t";
//cout << " " << date << endl;
found = 1;
count++;
}
ptr = ptr->next;
}
if (foundDateId == false) {
nfound++;
}
cout << "found " << found << endl;
cout << "not found " << nfound << endl;
cout << endl;
}
// Maps `key` to a bucket index in [0, tableSize).
//
// The hash is a position-weighted sum in unsigned arithmetic: each character
// code is multiplied by its 1-based position in the key, the products are
// added up, and the sum is reduced modulo tableSize.
int hasho::Hash(string key)
{
    unsigned int sum = 0;
    unsigned int position = 1;
    for (string::const_iterator it = key.begin(); it != key.end(); ++it, ++position) {
        sum += static_cast<int>(*it) * position;
    }
    return sum % tableSize;
}
Hash.h
#pragma once
#include "pch.h"
#include <iostream>
#include <string>
//#include "hash.cpp"
using namespace std;
#pragma comment(linker, "/STACK:7000000")
#pragma comment(linker, "/HEAP:7000000")
#ifndef HASH_H
#define HASH_H
// Fixed-size hash table with separate chaining that maps a patent id string
// to a date string.
//
// Each of the tableSize buckets is a singly linked list of `item` nodes.
// No destructor is declared, so the nodes allocated by the constructor and
// by AddItem() are never released by the class itself.
class hasho {
public:
    // Allocates one placeholder head node per bucket.
    hasho();

    // Returns the bucket index for `key`, in [0, tableSize).
    int Hash(std::string key);

    // Inserts the pair (pattend_id, date) into the table.
    void AddItem(std::string pattend_id, std::string date);

    // Returns the number of records stored in bucket `index`.
    int NumberOfItemsInIndex(int index);

    // Prints a per-bucket summary of the whole table to stdout.
    void PrintTable();

    // Prints every record of bucket `index` to stdout.
    void PrintItemsInIndex(int index);

    // Writes all records matching `pattent_id` to the output file.
    void FindDateId(std::string pattent_id, std::string assgnee_id);

    // Asks for an output file name on stdin and opens that file.
    void openOutputFile();

private:
    // Number of buckets.
    static const int tableSize = 300003;

    // One node of a bucket's chain.
    struct item {
        std::string pattent_id;
        std::string date;
        item* next; // next node in the same bucket, NULL at the end
    };

    // Head node of every bucket.
    item* HashTable[tableSize];
};
#endif // ! HASH_H