Следующий код token_tree основан на классе prefix_tree из ZeroMQ
Класс prefix_tree возвращает «true» только в том случае, если один из префиксов дерева соответствует началу входного текста.Он даже не скажет вам, какой префикс или как долго этот префикс был.
Это token_tree будет искать самый длинный токен, который соответствует началу входного текста.Функция поиска token_tree_longest_token () должна только возвращать длину самого длинного токена, сопоставленного с началом входного текста.
Основной алгоритм аналогичен описанному в вопросе, но его реализация может быть быстрее.
Также есть несколько способов улучшить использование памяти, которые могли бы сделать это быстрее.
#include <stdint.h>
#include <stdlib.h>
/* #define TEST_TOKEN_TREE */
/*
* TODO: possible improvements, use multiple types of nodes: string/branch/leaf.
* The string node would replace a chain of normal token_nodes and save memory.
* This would require spliting a node to add branch points.
* Use these structs:
* struct token_node {
* uint32_t ref_count;
* uint8_t node_type; -- node is token_node_str/token_node_branch/token_node_leaf
* };
* struct token_node_str {
* token_node base;
* uint8_t reserved;
* uint16_t len; -- string length
* token_node *child; -- string nodes can only have one child.
* uint8_t str[0]; -- embedded string (not null-terminated)
* };
* struct token_node_branch {
* token_node base;
* uint8_t min; -- smallest char in child list.
* uint16_t count; -- child count.
* token_node *children[0];
* };
* struct token_node_leaf { -- leaf nodes have no children.
* token_node base;
* };
* This will save memory, but will make code much more complex.
*/
typedef struct token_tree token_tree;
typedef struct token_node token_node;
struct token_tree {
token_node *root; /**< root node of token tree. */
};
struct token_node {
uint32_t ref_count; /**< how many token references end at this node. */
uint8_t min; /**< smallest 'char' in children's list. */
uint8_t reserved; /**< padding. */
uint16_t count; /**< number of children. (max count = 256, so count must be 16bits) */
token_node *children[0]; /**< list of children nodes. index by (c - min) */
};
#define NODE_SIZE(count) (sizeof(token_node) + (sizeof(token_node *) * count))
static token_node *token_node_new(uint16_t count) {
token_node *node = calloc(1, NODE_SIZE(count));
node->count = count;
return node;
}
static void token_node_build_chain(token_node **pnode, const uint8_t *token, size_t len) {
token_node *node;
do {
/* the last node in the chain will have no children. */
node = token_node_new((len == 0) ? 0 : 1);
*pnode = node; /* add node to slot in parent's children list. */
if(len == 0) break;
/* new node will have one child. */
node->min = *token;
node->count = 1;
/* slot where next node will be saved. */
pnode = &(node->children[0]);
/* consume char. */
token++;
len--;
} while(1);
/* mark last node as end of a valid token. */
node->ref_count++;
}
static void token_node_free(token_node *node) {
uint32_t i;
uint32_t count = node->count;
/* free children nodes. */
for(i=0; i < count; i++) {
if(node->children[i]) token_node_free(node->children[i]);
}
free(node);
}
static void token_node_grow(token_node **pnode, uint8_t c) {
token_node *node = *pnode;
token_node **children;
uint8_t old_min = node->min;
uint16_t old_count = node->count;
uint32_t i;
uint8_t min;
uint16_t count;
if(c < old_min) {
min = c;
count = old_count + (old_min - min);
} else {
if(old_count == 0) {
/* the list was empty, so this is the first char. */
old_min = c;
}
min = old_min;
c -= old_min;
if(c < old_count) {
/* don't need to grow. */
return;
}
count = c + 1;
}
node = realloc(node, NODE_SIZE(count));
*pnode = node;
children = node->children;
/* if the 'min' value changed, then we need to move all the old slots up. */
if(old_min != min) {
uint32_t diff = old_min - min;
for(i=count-1; i >= diff; i--) {
children[i] = children[i - diff];
}
/* null new slots at start of children list. */
for(i=0; i < diff; i++) {
children[i] = NULL;
}
} else {
/* null new slots at end of children list. */
for(i=old_count; i < count; i++) {
children[i] = NULL;
}
}
node->min = min;
node->count = count;
}
static token_node **token_node_find_last_node(token_node **pnode, const uint8_t **ptoken, size_t *plen) {
const uint8_t *token = *ptoken;
size_t len = *plen;
uint32_t c;
token_node *node = *pnode;
while(node && len) {
/* next char. */
c = (*token);
/* if c < node->min, then it will underflow and be > node->count. */
c -= node->min;
/* make sure c is in range. */
if(c >= node->count) {
/*
* NOTE: we don't consume this char and "*pnode" will not be null.
* When adding tokens, this node will be grown to hold more children.
*/
break;
}
/* consume char. */
token++;
len--;
/* get pointer to next node's slot. */
pnode = &(node->children[c]);
node = *pnode;
}
*ptoken = token;
*plen = len;
/* return pointer to last node's slot. */
return pnode;
}
static void token_node_add(token_node **pnode, const uint8_t *token, size_t len) {
token_node *node;
/* find last node in chain for this token. */
pnode = token_node_find_last_node(pnode, &token, &len);
/* if full token was consumed then we found the last node for this token. */
if(!len) {
node = *pnode;
node->ref_count++;
return;
}
/* check if the children list of the last node needs to be grown. */
node = *pnode;
if(node) {
uint32_t c = *token;
/* consume char. */
token++;
len--;
/* grow node to make room for new char. */
token_node_grow(pnode, c);
node = *pnode; /* token_node_grow() may change the node's pointer. */
/* get slot for new child. */
pnode = &(node->children[c - node->min]);
}
/* build node chain for un-consumed part of token. */
token_node_build_chain(pnode, token, len);
}
static size_t token_node_longest_token(token_node *node, const uint8_t *text, size_t len) {
size_t last_token_len = 0;
size_t off = 0;
uint32_t c;
/* loop until we get a NULL node or run out of text. */
do {
if(node->ref_count > 0) {
/* found a token, keep track of it's length. */
last_token_len = off;
}
/* end of input text. */
if(off >= len) break;
/* next char. */
c = text[off];
/* if c < node->min, then it will underflow and be > node->count. */
c -= node->min;
/* make sure c is in range. */
if(c >= node->count) {
/* End of search, no more child nodes. */
break;
}
/* consume char. */
off++;
/* get pointer to next node's slot. */
node = node->children[c];
} while(node);
/* return length of largest token found. */
return last_token_len;
}
extern token_tree *token_tree_new() {
token_tree *tree = malloc(sizeof(token_tree));
tree->root = token_node_new(0);
return tree;
}
extern void token_tree_free(token_tree *tree) {
token_node_free(tree->root);
free(tree);
}
extern void token_tree_add(token_tree *tree, const char *token, size_t len) {
token_node_add(&(tree->root), token, len);
}
extern size_t token_tree_longest_token(token_tree *tree, const char *text, size_t len) {
return token_node_longest_token(tree->root, text, len);
}
#ifdef TEST_TOKEN_TREE
#include <stdio.h>
#include <string.h>
static const char *test_tokens[] = {
"s",
"stack",
"stackoverflow",
"over",
"overflow",
NULL,
};
static const char *test_input[] = {
"aastackoverasdfasdf",
"stack7777",
"777stack777",
"overstackflow",
NULL,
};
static void add_tokens(token_tree *tree, const char **tokens) {
int i;
for(i = 0; tokens[i] != NULL; i++) {
token_tree_add(tree, tokens[i], strlen(tokens[i]));
}
}
static void print_tokens(token_tree *tree, const char *text) {
size_t len = strlen(text);
size_t token_len;
printf("input: \"%s\"\n", text);
printf("tokens: [");
while(len) {
token_len = token_tree_longest_token(tree, text, len);
if(token_len > 0) {
printf("<%.*s>", (int)token_len, text);
} else {
printf("?");
token_len = 1;
}
text += token_len;
len -= token_len;
}
printf("]\n");
}
static void run_test(token_tree *tree, const char **texts) {
int i;
for(i = 0; texts[i] != NULL; i++) {
print_tokens(tree, texts[i]);
}
}
int main(int argc, char *argv[]) {
token_tree *tree = token_tree_new();
add_tokens(tree, test_tokens);
run_test(tree, test_input);
run_test(tree, test_tokens);
token_tree_free(tree);
}
#endif