Я отделяю слова (с идентификаторами) от предложений и перевожу слова в нижний регистр, потому что вам все равно нужен поиск без учета регистра. Если в одной и той же позиции предложения находятся два совпадения, я выбираю более длинное. Если совпадения перекрываются (например, «ручное тестирование» и «стратегия тестирования»), я всегда выбираю то «слово», которое встречается в предложении первым.
С наилучшими пожеланиями, Стью Эштон
SQL> -- Fixture table: each row pairs a search word (and its id) with a sample sentence.
SQL> create table temp (
  2    id       number,
  3    word     varchar2(1000),
  4    sentence varchar2(2000)
  5  );
SQL> -- Sample data. The target columns are named explicitly so the load does not
SQL> -- depend on the column order of temp. Note that id 2 is used for two words.
SQL> insert into temp (id, word, sentence)
  2  select 1, 'automation testing', 'automtestingation TeStInG TEST is popular kind of testing' from dual union all
  3  select 2, 'testing', 'manual testing' from dual union all
  4  select 2, 'test', 'test' from dual union all
  5  select 3, 'manual testing', 'this is an old method of testing' from dual union all
  6  select 4, 'punctuation', 'automation Testing,manual tEsting,punctuation,automanual teSting-tesTing' from dual union all
  7  select 5, 'B-number analysis', 'B-number analysis table' from dual union all
  8  select 6, 'B-number analysis table', 'testing B-number analysis' from dual union all
  9  select 7, 'Not Matched', 'Testing tEsting teSting' from dual;
SQL> -- Working copy holding only the sentence text of the fixture rows.
SQL> create table sentences as
  2  select t.sentence
  3  from temp t;
SQL> -- One row per (length, lower-cased word); when several fixture rows collapse
SQL> -- into the same group, the smallest id is kept.
SQL> create table words cache as
  2  select length(t.word) as word_length,
  3         min(t.id)      as id,
  4         lower(t.word)  as word
  5  from temp t
  6  group by length(t.word), lower(t.word);
SQL> -- Extra test sentence: all known words in alphabetical order, comma-separated.
SQL> -- The target column is named explicitly instead of relying on table layout.
SQL> insert into sentences (sentence)
  2  select listagg(w.word, ',') within group (order by w.word)
  3  from words w;
SQL> -- Control sentence that is expected to match none of the words.
SQL> insert into sentences (sentence) values ('Nothing matches here');
SQL> commit;
SQL> declare
  2    -- Rewrites every sentence that contains a known word: each whole-word
  3    -- match becomes http://localhost/<id>/<u>matched text</u>.
  4    --
  5    -- Candidate rows only: instr() is a plain substring prefilter; the
  6    -- whole-word test is done in change_sentence. The rows are locked
  7    -- (for update) and later updated by rowid.
  8    cursor cur_sentences is
  9      select rowid rid, sentence
 10      from sentences s
 11      where exists (
 12        select null
 13        from words
 14        where instr(lower(s.sentence), word) > 0
 15      )
 16      for update;
 17    type tt_sentences is table of cur_sentences%rowtype;
 18    lt_sentences     tt_sentences;
 19    lt_sentences_new tt_sentences;
 20    --
 21    -- Returns p_sentence with every kept match wrapped in link markup.
 22    -- Matching is case-insensitive and a match must be delimited by
 23    -- non-word characters or by the ends of the sentence. When matches
 24    -- collide, the leftmost wins and, at the same position, the longest.
 25    -- Any error is re-raised after the word cursor has been closed.
 26    function change_sentence(p_sentence in sentences.sentence%type)
 27      return sentences.sentence%type
 28    is
 29      -- All occurrences of all words: the anchor member finds the first
 30      -- occurrence of each word, the recursive member searches again from
 31      -- pos+1. NOTE: word is concatenated into the pattern unescaped, so
 32      -- regex metacharacters in a word are interpreted as regex syntax.
 33      cursor cur_words(cp_sentence in sentences.sentence%type) is
 34        with recurse (pos, word_length, id, word) as (
 35          select regexp_instr(cp_sentence, '(^|\W)('||word||')(\W|$)', 1, 1, 0, 'i', 2),
 36                 word_length, id, word
 37          from words
 38          where regexp_instr(cp_sentence, '(^|\W)('||word||')(\W|$)', 1, 1, 0, 'i', 2) > 0
 39          union all
 40          select regexp_instr(cp_sentence, '(^|\W)('||word||')(\W|$)', pos+1, 1, 0, 'i', 2),
 41                 word_length, id, word
 42          from recurse
 43          where regexp_instr(cp_sentence, '(^|\W)('||word||')(\W|$)', pos+1, 1, 0, 'i', 2) > 0
 44        )
 45        select pos, word_length, id, word,
 46               substr(cp_sentence, pos, length(word)) new_word
 47        from recurse
 48        order by pos, word_length desc;
 49      type tt_words is table of cur_words%rowtype;
 50      lt_words      tt_words;
 51      lt_words_kept tt_words := new tt_words();
 52      l_pos         number := 0;
 53      l_sentence    sentences.sentence%type := p_sentence;
 54    begin
 55      open cur_words(p_sentence);
 56      fetch cur_words bulk collect into lt_words;
 57      close cur_words;
 58      -- Rows arrive ordered by position, longest first. l_pos is the
 59      -- position just past the last kept match; a later row is kept only
 60      -- when it starts beyond l_pos, which drops overlapping matches.
 61      for i in 1..lt_words.count loop
 62        if l_pos < lt_words(i).pos then
 63          l_pos := lt_words(i).pos + lt_words(i).word_length;
 64          lt_words_kept.extend;
 65          lt_words_kept(lt_words_kept.count) := lt_words(i);
 66        end if;
 67      end loop;
 68      -- Splice from right to left so the positions of earlier matches stay
 69      -- valid. Plain substr/concatenation is used instead of regexp_replace
 70      -- so the matched text is never re-read as a pattern or as a
 71      -- replacement string with backreferences.
 72      for i in reverse 1..lt_words_kept.count loop
 73        l_sentence :=
 74          substr(l_sentence, 1, lt_words_kept(i).pos - 1)
 75          || 'http://localhost/' || lt_words_kept(i).id
 76          || '/<u>' || lt_words_kept(i).new_word || '</u>'
 77          || substr(l_sentence, lt_words_kept(i).pos + length(lt_words_kept(i).new_word));
 78      end loop;
 79      return l_sentence;
 80    exception
 81      when others then
 82        -- The cursor may already be closed; closing it again would raise
 83        -- INVALID_CURSOR and hide the original error.
 84        if cur_words%isopen then
 85          close cur_words;
 86        end if;
 87        raise;
 88    end change_sentence;
 89  begin
 90    open cur_sentences;
 91    loop
 92      -- Work in batches of 100 rows.
 93      fetch cur_sentences bulk collect into lt_sentences limit 100;
 94      exit when lt_sentences.count = 0;
 95      lt_sentences_new := new tt_sentences();
 96      lt_sentences_new.extend(lt_sentences.count);
 97      for i in 1..lt_sentences.count loop
 98        lt_sentences_new(i).sentence := change_sentence(lt_sentences(i).sentence);
 99      end loop;
100      forall i in 1..lt_sentences.count
101        update sentences
102        set sentence = lt_sentences_new(i).sentence
103        where rowid = lt_sentences(i).rid;
104      exit when cur_sentences%notfound;
105    end loop;
106    close cur_sentences;
107  exception
108    when others then
109      if cur_sentences%isopen then
110        close cur_sentences;
111      end if;
112      -- Always propagate: the re-raise must not depend on the cursor state.
113      raise;
114  end;
115  /
PL/SQL procedure successfully completed.
SQL> select sentence from sentences order by sentence;
Nothing matches here
automtestingation http://localhost/2/<u>TeStInG</u> http://localhost/2/<u>TEST</u> is popular kind of http://localhost/2/<u>testing</u>
http://localhost/1/<u>automation Testing</u>,http://localhost/3/<u>manual tEsting</u>,http://localhost/4/<u>punctuation</u>,automanual http://localhost/2/<u>teSting</u>-http://localhost/2/<u>tesTing</u>
http://localhost/1/<u>automation testing</u>,http://localhost/5/<u>b-number analysis</u>,http://localhost/6/<u>b-number analysis table</u>,http://localhost/3/<u>manual testing</u>,http://localhost/7/<u>not matched</u>,http://localhost/4/<u>punctuation</u>,http://localhost/2/<u>test</u>,http://localhost/2/<u>testing</u>
http://localhost/2/<u>Testing</u> http://localhost/2/<u>tEsting</u> http://localhost/2/<u>teSting</u>
http://localhost/2/<u>testing</u> http://localhost/5/<u>B-number analysis</u>
http://localhost/3/<u>manual testing</u>
http://localhost/6/<u>B-number analysis table</u>
this is an old method of http://localhost/2/<u>testing</u>