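Если вы заранее знаете, сколько уникальных записей должно получиться, можно сгруппировать похожие строки: посчитать попарные расстояния редактирования (`adist`), выполнить иерархическую кластеризацию (`hclust`) и разрезать дерево на нужное число кластеров (`cutree`):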
> disease.df
[,1]
row1 "SEPTICEMIA OR SEVERE SEPSIS W/O MV 96+ HOURS W/O MCC"
row2 "SEPTICEMIA OR SEVERE SEPSIS W/O MV >96 HOURS W/O MCC"
row3 "ECMO OR TRACH W MV 96+ HRS OR PDX EXC FACE, MOUTH & NECK W MAJ"
row4 "ECMO OR TRACH W MV >96 HRS OR PDX EXC FACE, MOUTH & NECK W MAJ"
> d <- adist(disease.df)
> rownames(d) <- disease.df
> hc <- hclust(as.dist(d))
> data.frame(disease.df,cutree(hc,k=2))
                                                         disease.df cutree.hc..k...2.
row1           SEPTICEMIA OR SEVERE SEPSIS W/O MV 96+ HOURS W/O MCC                 1
row2           SEPTICEMIA OR SEVERE SEPSIS W/O MV >96 HOURS W/O MCC                 1
row3 ECMO OR TRACH W MV 96+ HRS OR PDX EXC FACE, MOUTH & NECK W MAJ                 2
row4 ECMO OR TRACH W MV >96 HRS OR PDX EXC FACE, MOUTH & NECK W MAJ                 2