@CraigBarnes прав: UnicodeData.txt действительно содержит все символы. Вот доказательство (код на Python):
import csv
# Count the characters described by UnicodeData.txt.
#
# D maps each counted code point (int) to the remaining fields of its record,
# name first. Records given as a "<..., First>" / "<..., Last>" pair are
# expanded into one entry per code point in the range.
D = {}
# utf-8-sig drops a leading BOM if present; newline='' is what the csv module
# recommends for files handed to csv.reader.
with open('UnicodeData.txt', encoding='utf-8-sig', newline='') as f:
    r = csv.reader(f, delimiter=';')
    for line in r:
        if len(line) < 2:
            continue  # blank or truncated row: nothing to count
        name = line[1]
        is_range_start = name.startswith('<') and name.endswith(', First>')
        if is_range_start and 'Private Use' not in name and 'Surrogate' not in name:
            # Expand every assigned range, not only the Ideograph/Hangul ones,
            # so ranges with other labels are not dropped by the '<' skip below.
            end = next(r)  # the matching "<..., Last>" record
            # Strip the leading '<' and the trailing ', First>' (8 characters).
            # The result is a label for the range, not necessarily the
            # official character name.
            label = name[1:-8].upper()
            for i in range(int(line[0], 16), int(end[0], 16) + 1):
                D[i] = [label + '-' + f'{i:04X}'] + line[2:]
        elif name.startswith('<'):
            continue  # skip private use, surrogate and control entries
        else:
            D[int(line[0], 16)] = line[1:]  # count everything else as one entry
print(len(D))
Вывод:
137929