Python 3.x Как извлечь словесное (алфавитное) представление чисел из строки - PullRequest
0 голосов
/ 09 июля 2020

Мне нужно разобрать текст, который может содержать алфавитные числа. Например

"I`ve got sixty six tasks"

или

"There is four people"

Моя цель - получить подстроку sixty six и four

В интернете много подходов к преобразованию строкового представления числа в целое число, но они рассчитаны на строку без постороннего текста. Мне же нужен следующий результат:

find_numbers("Hello world") -> []
find_numbers("Hello five world") -> ['five']

Ответы [ 2 ]

0 голосов
/ 09 июля 2020

Так как мне было скучно, я модифицировал версию word_to_num. Некоторые проверки ошибок не включены, но вы можете добавить их, если хотите. Код разбивает числа на группы, а затем подаёт каждую из этих групп в алгоритм word2number.

# Lookup table: English number word -> integer value.
# Covers the units and teens (0-19), the tens (20-90) and the scale words
# hundred / thousand / million / billion.
american_number_system = dict(
    zero=0, one=1, two=2, three=3, four=4,
    five=5, six=6, seven=7, eight=8, nine=9,
    ten=10, eleven=11, twelve=12, thirteen=13, fourteen=14,
    fifteen=15, sixteen=16, seventeen=17, eighteen=18, nineteen=19,
    twenty=20, thirty=30, forty=40, fifty=50,
    sixty=60, seventy=70, eighty=80, ninety=90,
    hundred=100,
    thousand=1_000,
    million=1_000_000,
    billion=1_000_000_000,
)


def number_formation(number_words):
    """Combine a short run of number words (below a scale word) into a value.

    The words are looked up in ``american_number_system`` and combined
    according to how many there are:

    * four words  -> ``a * b + c + d``  (e.g. "three hundred twenty eight")
    * three words -> ``a * b + c``      (e.g. "three hundred five")
    * two words   -> ``a * b`` when the second word is "hundred"
                     (e.g. "three hundred"), otherwise ``a + b``
                     (e.g. "sixty six", "hundred five")
    * otherwise   -> the value of the first word

    Args:
        number_words: Sequence of lowercase number words.

    Returns:
        The combined integer value.

    Raises:
        KeyError: If a word is not a key of ``american_number_system``.
        IndexError: If ``number_words`` is empty.
    """
    numbers = [american_number_system[word] for word in number_words]

    if len(numbers) == 4:
        return (numbers[0] * numbers[1]) + numbers[2] + numbers[3]
    if len(numbers) == 3:
        return numbers[0] * numbers[1] + numbers[2]
    if len(numbers) == 2:
        # Multiply only for "<n> hundred". Checking for 100 anywhere in the
        # pair would turn "hundred five" into 500 instead of 105.
        if numbers[1] == 100:
            return numbers[0] * numbers[1]
        return numbers[0] + numbers[1]
    return numbers[0]

def get_decimal_sum(decimal_digit_words):
    """Convert the digit words after a decimal point into a fraction.

    Each word must be a single digit word ("zero" .. "nine"); the digits are
    concatenated after "0." and parsed as a float, so ``['two', 'five']``
    becomes ``0.25``.

    Args:
        decimal_digit_words: Sequence of lowercase digit words.

    Returns:
        The fractional value as a float, or ``0`` as soon as a word that is
        not a single digit word is encountered.
    """
    # The original body tested membership in ``decimal_words`` without ever
    # defining it, so any non-empty input raised NameError. The tuple is
    # ordered so that each word's index is the digit it names.
    decimal_words = (
        'zero', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine',
    )

    digits = []
    for dec_word in decimal_digit_words:
        if dec_word not in decimal_words:
            return 0
        digits.append(str(decimal_words.index(dec_word)))

    return float('0.' + ''.join(digits))

def to_num(string):
    """Find every spelled-out number in ``string`` and return the values.

    Hyphens and commas are treated as spaces. Consecutive number words are
    collected into a group; the word "and" does not end a group and is
    simply skipped. Each group is then converted to a number using the
    billion / million / thousand positions inside it.

    Args:
        string: Free text that may contain English number words.

    Returns:
        A list with one numeric value per group, in order of appearance.
        The list is empty when the text contains no number words.
    """
    string = string.replace('-', ' ')
    string = string.replace(',', ' ')
    # Lowercase once so capitalised words ("Five", "Sixty") are recognised;
    # the lookup table only has lowercase keys.
    words = string.lower().split()

    def multiplier(scale_words):
        # A group may start with a bare scale word ("thousand one"), which
        # leaves nothing in front of the scale. Read it as "one thousand"
        # rather than letting number_formation index an empty list.
        return number_formation(scale_words) if scale_words else 1

    number_groups = []
    current_word = []
    numbers = []

    for word in words:
        if word in american_number_system:
            current_word.append(word)
        elif word != 'and' and current_word:
            # Any other word closes the group collected so far.
            number_groups.append(current_word)
            current_word = []

    if current_word:
        number_groups.append(current_word)

    for clean_numbers in number_groups:
        clean_decimal_numbers = []
        total_sum = 0

        # NOTE(review): groups only ever contain keys of
        # american_number_system, and 'point' is not one of them in the
        # visible table, so this branch does not fire as written.
        if clean_numbers.count('point') == 1:
            clean_decimal_numbers = clean_numbers[clean_numbers.index('point')+1:]
            clean_numbers = clean_numbers[:clean_numbers.index('point')]

        billion_index = clean_numbers.index('billion') if 'billion' in clean_numbers else -1
        million_index = clean_numbers.index('million') if 'million' in clean_numbers else -1
        thousand_index = clean_numbers.index('thousand') if 'thousand' in clean_numbers else -1

        if len(clean_numbers) == 1:
            total_sum += american_number_system[clean_numbers[0]]

        else:
            if billion_index > -1:
                billion_multiplier = multiplier(clean_numbers[0:billion_index])
                total_sum += billion_multiplier * 1000000000

            if million_index > -1:
                # The million multiplier starts after "billion" when present.
                if billion_index > -1:
                    million_multiplier = multiplier(clean_numbers[billion_index+1:million_index])
                else:
                    million_multiplier = multiplier(clean_numbers[0:million_index])
                total_sum += million_multiplier * 1000000

            if thousand_index > -1:
                # The thousand multiplier starts after the nearest larger
                # scale word, or at the beginning of the group.
                if million_index > -1:
                    thousand_multiplier = multiplier(clean_numbers[million_index+1:thousand_index])
                elif billion_index > -1 and million_index == -1:
                    thousand_multiplier = multiplier(clean_numbers[billion_index+1:thousand_index])
                else:
                    thousand_multiplier = multiplier(clean_numbers[0:thousand_index])
                total_sum += thousand_multiplier * 1000

            # Whatever follows the smallest scale word present is the
            # sub-thousand remainder; with no scale word the whole group is.
            if thousand_index > -1 and thousand_index != len(clean_numbers)-1:
                hundreds = number_formation(clean_numbers[thousand_index+1:])
            elif million_index > -1 and million_index != len(clean_numbers)-1:
                hundreds = number_formation(clean_numbers[million_index+1:])
            elif billion_index > -1 and billion_index != len(clean_numbers)-1:
                hundreds = number_formation(clean_numbers[billion_index+1:])
            elif thousand_index == -1 and million_index == -1 and billion_index == -1:
                hundreds = number_formation(clean_numbers)
            else:
                hundreds = 0
            total_sum += hundreds

        if len(clean_decimal_numbers) > 0:
            decimal_sum = get_decimal_sum(clean_decimal_numbers)
            total_sum += decimal_sum

        numbers.append(total_sum)

    return numbers

# Demo run: convert a handful of sample sentences and print all results.
tests = [
    to_num(sentence)
    for sentence in (
        "I`ve got sixty six tasks",
        "There is four people",
        "Hello world",
        "Hello five world",
        "i have three apples and two bananas",
        "three hundred twenty eight",
    )
]

print(tests)

После этого вы можете просто использовать num2words, чтобы преобразовать результат обратно в слова.

Изменить:

На самом деле, перечитав ваш вопрос, я понял, что всё намного проще: вам просто нужно найти позиции этих чисел в строке и извлечь их.

# Lookup table: English number word -> integer value.
# Covers the units and teens (0-19), the tens (20-90) and the scale words
# hundred / thousand / million / billion.
american_number_system = dict(
    zero=0, one=1, two=2, three=3, four=4,
    five=5, six=6, seven=7, eight=8, nine=9,
    ten=10, eleven=11, twelve=12, thirteen=13, fourteen=14,
    fifteen=15, sixteen=16, seventeen=17, eighteen=18, nineteen=19,
    twenty=20, thirty=30, forty=40, fifty=50,
    sixty=60, seventy=70, eighty=80, ninety=90,
    hundred=100,
    thousand=1_000,
    million=1_000_000,
    billion=1_000_000_000,
)

def extract_num(raw_string):
    """Return the substrings of ``raw_string`` that spell out numbers.

    Hyphens and commas count as word separators. Consecutive number words
    form one result; the word "and" between number words keeps the run
    going (so "one hundred and five" is a single result) but is never
    included at the end of a result. Matching is case-insensitive and each
    result is sliced from the original text, so its original spelling,
    hyphens and capitalisation are preserved.

    Args:
        raw_string: Free text that may contain English number words.

    Returns:
        A list of substrings in order of appearance; empty when the text
        contains no number words.
    """
    # Both replacements are one character for one character, so every
    # offset in ``string`` is also a valid offset in ``raw_string``.
    string = raw_string.replace('-', ' ').replace(',', ' ')

    numbers = []
    span_start = None   # offset of the first word of the current run
    span_end = None     # offset just past the last *number* word of the run
    search_from = 0

    for word in string.split():
        # Locate the word in the text instead of assuming that words are
        # separated by exactly one space; leading whitespace and repeated
        # spaces would otherwise shift every slice.
        start = string.find(word, search_from)
        end = start + len(word)
        search_from = end

        lowered = word.lower()
        if lowered in american_number_system:
            if span_start is None:
                span_start = start
            span_end = end
        elif lowered == 'and' and span_start is not None:
            # Keep the run open without moving span_end, so a trailing
            # "and" ("five and apples") is not part of the result.
            continue
        elif span_start is not None:
            numbers.append(raw_string[span_start:span_end])
            span_start = None

    if span_start is not None:
        # Slice to the last number word, not to the end of the text, so
        # trailing whitespace or a dangling "and" is left out.
        numbers.append(raw_string[span_start:span_end])

    return numbers


# Demo run: extract the number phrases from a handful of sample sentences
# and print all results.
tests = [
    extract_num(sentence)
    for sentence in (
        "I`ve got sixty six tasks",
        "There is four people",
        "Hello world",
        "Hello five world",
        "i have three apples and two bananas",
        "three hundred twenty eight",
    )
]

print(tests)
0 голосов
/ 09 июля 2020

Для этого вам нужно использовать 2 библиотеки:

word2number извлечет число из строки.

например,

>>> print(w2n.word_to_num("Hello five world"))
5

Затем вы можете использовать библиотеку num2words для преобразования вывода обратно в слово:

>>> print(num2words(5))
five
...