Вот решение, которое с помощью групп регулярного выражения разбивает текст на токены разных типов.
Вы можете проверить код здесь https://jsfiddle.net/u3mvca6q/5/
Basic Regex explanation:
/ Regex start
(\w+) First group, words: \w matches one word character (ASCII letter, digit or underscore) and + means one or more of them
| or
(,|!) Second group, punctuation
| or
(\s) Third group, white spaces
/ Regex end
g "global", enables looping over the string to capture one element at a time
Regex result:
result[0] : full match : the text of whichever group matched
result[1] : group1 : words
result[2] : group2 : punctuation , !
result[3] : group3 : whitespace
// Tokenizer for plain ASCII text; exactly one capture group is set per match:
//   1: (\w+)  a run of word characters
//   2: (,|!)  a comma or an exclamation mark
//   3: (\s)   a single whitespace character
// The g flag lets repeated exec() calls walk through the string one token at a time.
var basicRegex = /(\w+)|(,|!)|(\s)/g;
Advanced Regex explanation:
[a-zA-Z\u0080-\u00FF] instead of \w Supports some Unicode letters instead of ASCII letters only. Find Unicode ranges here https://apps.timwhitlock.info/js/regex
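(\.\.\.|\.|,|!|\?) Matches an ellipsis (...) and a single period (.) as separate tokens, and adds ? to the punctuation group; \.\.\. is listed before \. so an ellipsis is not split into three periods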
You can improve it by adding ranges for special punctuation and so on
// Tokenizer with Latin-1 letter support; exactly one capture group is set per match:
//   1: a run of letters (ASCII a-z / A-Z plus the letters of the Latin-1 Supplement block)
//   2: punctuation: "...", ".", ",", "!" or "?" ("..." is tried first so an ellipsis stays one token)
//   3: a single whitespace character
// The letter class is deliberately narrower than the whole \u0080-\u00FF range: that range
// also contains C1 control codes, the no-break space (\u00A0), symbols such as \u00AB / \u00BB
// and the \u00D7 / \u00F7 signs, which would otherwise be glued onto neighbouring words.
// With the narrower class a no-break space is reported by the whitespace group instead.
var advancedRegex = /([a-zA-Z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+)|(\.\.\.|\.|,|!|\?)|(\s)/g;
// Sample inputs for the two tokenizers.
var basicString = "Hello, this is a random message!";
var advancedString = "Et en français ? Avec des caractères spéciaux ... With one point at the end.";

// exec() on a global regex resumes from lastIndex and returns null once the input is
// exhausted (which also resets lastIndex), so each loop visits every token in order.
// Each match array is [fullMatch, word, punctuation, whitespace]; only the group that
// matched is defined. Every match is printed; the terminating null is not.
var result = null;
while ((result = basicRegex.exec(basicString)) !== null) {
  console.log(result);
}
while ((result = advancedRegex.exec(advancedString)) !== null) {
  console.log(result);
}
Array [ "Hello", "Hello", undefined, undefined ]
Array [ ",", undefined, ",", undefined ]
Array [ " ", undefined, undefined, " " ]
Array [ "this", "this", undefined, undefined ]
Array [ " ", undefined, undefined, " " ]
Array [ "is", "is", undefined, undefined ]
Array [ " ", undefined, undefined, " " ]
Array [ "a", "a", undefined, undefined ]
Array [ " ", undefined, undefined, " " ]
Array [ "random", "random", undefined, undefined ]
Array [ " ", undefined, undefined, " " ]
Array [ "message", "message", undefined, undefined ]
Array [ "!", undefined, "!", undefined ]