KeyError reverse HashMap - простая модель сотрудничества KNN на языке Python - PullRequest
0 голосов
/ 13 марта 2019

У меня проблема с моим рекомендателем.Для некоторых сортов пива я хочу, чтобы рекомендации работали идеально, но иногда он возвращает KeyError.Я понятия не имею, почему это происходит?

Всегда происходит, когда вы пытаетесь рекомендовать один и тот же элемент, поэтому он, вероятно, как-то связан с кодом hashmap или обратным hashmap.

ИзображениеKeyError при запуске скрипта

** Изображение **

Код

import os
import time
import gc
import argparse
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

class KnnRecommender:
    """
    This is an item based collaborative filtering recommender with KNN implemented by sklearn
    """
    def __init__(self, path_beers, path_tastingprofiles):
        """
        Recommender requires path to data: movies data and ratings data
        Parameters
        ----------
        path_movies: str, movies data file path
        path_ratings: str, ratings data file path
        """
        self.path_beers = path_beers
        self.path_tastingprofiles = path_tastingprofiles
        self.model = NearestNeighbors()

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors
        Parameters
        ----------
        n_neighbors: int, optional (default = 5)
        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
        n_jobs: int or None, optional (default=None)
        """
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(**{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs})

    def _prep_data(self):
        """
        prepare data for recommender
        1. beer-tastingprofile scipy sparse matrix
        2. hashmap of beer to row index in beer-tastingprofile scipy sparse matrix
        """
        # read data
        df_beers = pd.read_csv(
            os.path.join(self.path_beers),
            usecols=['beerID', 'name', 'beertypeID'],
            dtype={'beerID': 'int32', 'name': 'str', 'beerID': 'int32'})
        df_tastingprofiles = pd.read_csv(
            os.path.join(self.path_tastingprofiles),
            usecols=['beerID', 'malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'],
            dtype={'beerID': 'int32', 'malty': 'float32', 'sweet': 'float32', 'sour': 'float32', 'hoppy': 'float32', 'bitter': 'float32', 'fruity': 'float32'})


        #filtering beers/removing unprofiled beers
        df_beers_merged = pd.merge(df_tastingprofiles, df_beers, on='beerID')
        df_beers = df_beers_merged.drop(['malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'], axis=1)

        # pivot and create tastingprofile matrix
        df_tastingprofile_features = df_tastingprofiles.set_index('beerID')

        # create mapper from beer name to index
        hashmap = {
            beer: i for i, beer in
            enumerate(list(df_beers.set_index('beerID').loc[df_tastingprofile_features.index].name)) # noqa
        }

        #converting tastingprofile features to scipy sparse matrix
        mat_tastingprofile_features = csr_matrix(df_tastingprofile_features.values)


        # clean up
        del df_beers, df_beers_merged
        del df_tastingprofiles, df_tastingprofile_features
        return mat_tastingprofile_features, hashmap

    def _fuzzy_matching(self, hashmap, fav_beer):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map beer name to index of the beer in data
        fav_beer: str, name of user input beer
        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for name, idx in hashmap.items():
            ratio = fuzz.ratio(name.lower(), fav_beer.lower())
            if ratio >= 60:
                match_tuple.append((name, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

    def _inference(self, model, data, hashmap,
                   fav_beer, n_recommendations):
        """
        return top n similar beer recommendations based on user's input movie
        Parameters
        ----------
        model: sklearn model, knn model
        data: beer-tastingprofile matrix
        hashmap: dict, map beer name to index of the mobeervie in data
        fav_beer: str, name of user input beer
        n_recommendations: int, top n recommendations
        Return
        ------
        list of top n similar beer recommendations
        """
        # fit
        model.fit(data)
        # get input movie index
        print('You have input movie:', fav_beer)
        idx = self._fuzzy_matching(hashmap, fav_beer)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        distances, indices = model.kneighbors(
            data[idx],
            n_neighbors=n_recommendations+1)
        # get list of raw idx of recommendations
        raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
        print('It took my system {:.2f}s to make inference \n\
              '.format(time.time() - t0))
        # return recommendation (movieId, distance)
        return raw_recommends

    def make_recommendations(self, fav_beer, n_recommendations):
        """
        make top n beer recommendations
        Parameters
        ----------
        fav_beer: str, name of user input beer
        n_recommendations: int, top n recommendations
        """
        # get data
        mat_tastingprofile_features, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(
            self.model, mat_tastingprofile_features, hashmap,
            fav_beer, n_recommendations)
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_beer))
        for i, (idx, dist) in enumerate(raw_recommends):
            #reverse_hashmap[idx]
            print('{0}: {1}, with distance of {2}'.format(i+1,reverse_hashmap[idx], dist))

def parse_args():
    parser = argparse.ArgumentParser(
        prog="Beer Recommender",
        description="Run KNN Beer Recommender")
    parser.add_argument('--path', nargs='?', default='',
                         help='input data path')
    parser.add_argument('--beer_filename', nargs='?', default='beer.csv',
                        help='provide beer filename')
    parser.add_argument('--tastingprofile_filename', nargs='?', default='tastingprofile.csv',
                        help='provide tastingprofile filename')
    parser.add_argument('--beer_name', nargs='?', default='',
                        help='provide your favorite beer name')
    parser.add_argument('--top_n', type=int, default=10,
                        help='top n beer recommendations')
    return parser.parse_args()    

if __name__ == '__main__':
    # get args
    args = parse_args()
    data_path = args.path
    beer_filename = args.beer_filename
    tastingprofile_filename = args.tastingprofile_filename
    beer_name = args.beer_name
    top_n = args.top_n
    # initial recommender system
    recommender = KnnRecommender(
    os.path.join(data_path, beer_filename),
    os.path.join(data_path, tastingprofile_filename))
    recommender.set_model_params(20, 'brute', 'cosine', -1)
    # make recommendations
    recommender.make_recommendations(beer_name, top_n)

1 Ответ

0 голосов
/ 13 марта 2019

Я исправил это. Выяснилось, что когда я использую имена в качестве значений в Hashmap, дубликаты автоматически удаляются. Таким образом, hashmap был меньше, чем полный список БД. Я решил это, удалив дубликаты в моем наборе данных, прежде чем использовать его для чего-либо в алгоритме рекомендации.

Я покажу вам мое простое исправление с объединением панд данных и кадров drop_duplicates.

#Remove duplicates from Beers dataset
        df_beers_noduplicates = df_beers.drop_duplicates(subset='name', keep='first', inplace=False)
        df_beers_merged = pd.merge(df_tastingprofiles, df_beers_noduplicates, on='beerID')
        df_beers = df_beers_merged.drop(['malty', 'sweet', 'sour', 'hoppy', 'bitter', 'fruity'], axis=1)
        df_tastingprofiles = df_beers_merged.drop(['name'], axis=1)
...