Здесь обсуждается аналогичная, но более сложная задача — «очистка» HTML-текста по списку правил (для доступа нужно лишь зарегистрироваться на этом сайте). В обсуждении приведено несколько способов сделать это на T-SQL и один способ — самый быстрый на сегодняшний день — с помощью SQLCLR. Поскольку решение на VB.Net/SQLCLR написал я, привожу его ниже.
Вот замены/преобразования текста, которые оно выполняет:
- Удалить теги и содержимое скрипта
- Заменить все теги HTML пробелами.
- Заменить `&nbsp;` пробелом
- Заменить все коды сущностей («&xxx;») на «X»
- Заменить пробелом все знаки пунктуации и математические символы (. , ; : ' ” & ( ) [ ] + / < > ≥ ≤ ° ÷); тире не заменяются. Дополнительно в этот список входят: проценты, косая черта, подчёркивание, карет (^), звёздочка, знак равенства, фигурные скобки, вопросительный и восклицательный знаки, вертикальная черта, знаки доллара и цента, знак фунта, табуляция, CRLF
- Заменить все цифры пробелом
- Заменить все однобуквенные слова пробелом (образец: пробел, одиночный символ, подстановочный знак)
- Удалить лишние пробелы.
Ниже приведён код, который реализует текстовый преобразователь типа DFSA (детерминированный конечный автомат; точнее, почти детерминированный, потому что в нескольких местах он выполняет просмотр):
Imports System
Imports System.Data
Imports System.Data.SqlClient
Imports System.Data.SqlTypes
Imports Microsoft.SqlServer.Server
''' <summary>
''' SQLCLR scalar functions for stripping HTML markup from byte-oriented text.
''' </summary>
''' <remarks>
''' HTMLCleaner is a single-pass, byte-at-a-time state machine. It compares raw
''' bytes against ASCII codes, so it presumably expects a single-byte,
''' ASCII-compatible encoding (e.g. VARCHAR cast to VARBINARY) - TODO confirm
''' against the T-SQL wrapper that calls it.
''' </remarks>
Partial Public Class UserDefinedFunctions

    ''' <summary>Top-level states of the HTMLCleaner state machine.</summary>
    Public Enum States
        ' Start of input, or the previously emitted byte was a space.
        Space1
        ' Between '&' and the terminating ';' of a character entity.
        Entity
        ' Between '<' and the terminating '>' of a tag.
        HTMLTag
        ' Inside ordinary text (not directly after a space).
        Norm
        ' Exactly one letter has been emitted since the last space.
        Word1
        ' Skipping everything up to the closing </script> tag.
        Script
        ' Skipping everything up to the closing </style> tag.
        Style
    End Enum

    ''' <summary>Sub-states used while skipping script/style contents.</summary>
    Enum SubStates
        ' Scanning for a '<' that may start the closing tag.
        None
        ' A '<' was seen; a '/' must follow for this to be a closing tag.
        EndBegin
        ' "</" was seen; accumulating the closing tag's name up to '>'.
        EndSlash
    End Enum

    ' ASCII codes of the bytes the state machine cares about.
    ' Several of these are not referenced in this file; they are kept because
    ' the class is Partial and other parts may use them.
    Const CharSpace As Integer = 32
    Const CharAmp As Integer = 38
    Const CharSlash As Integer = 47
    Const CharLT As Integer = 60
    Const CharGT As Integer = 62
    Const CharA As Integer = 65
    Const CharX As Integer = 88
    Const CharZ As Integer = 90
    Const Char_a As Integer = 97
    Const Char_b As Integer = 98
    Const Char_n As Integer = 110
    Const Char_p As Integer = 112
    Const Char_s As Integer = 115
    Const Char_z As Integer = 122
    Const CharDash As Integer = 45
    Const CharSemiC As Integer = 59

    ''' <summary>
    ''' Returns the leading run of non-whitespace characters of a tag's inner
    ''' text, i.e. the tag name without attributes ("script type=x" gives "script").
    ''' </summary>
    Private Shared Function ExtractTagName(ByVal tagText As String) As String
        Dim cut As Integer = 0
        While cut < tagText.Length AndAlso Not Char.IsWhiteSpace(tagText(cut))
            cut += 1
        End While
        Return tagText.Substring(0, cut)
    End Function

    ''' <summary>
    ''' Strips HTML markup from a byte string and normalises what is left.
    ''' </summary>
    ''' <param name="chars">Input bytes; may be SQL NULL.</param>
    ''' <returns>
    ''' SQL NULL for NULL input; otherwise a new SqlBytes in which:
    '''  - script and style elements are removed together with their contents;
    '''  - every other tag becomes a single space;
    '''  - the entity "nbsp" becomes a space, any other "&amp;...;" entity becomes "X";
    '''  - only ASCII letters, dashes and single spaces are kept (digits,
    '''    punctuation and all other bytes are dropped);
    '''  - a one-letter word that is followed by a space is removed;
    '''  - runs of spaces are collapsed and leading/trailing spaces are trimmed.
    ''' </returns>
    <Microsoft.SqlServer.Server.SqlFunction( _
        DataAccess:=DataAccessKind.None _
        , IsDeterministic:=True _
        , IsPrecise:=True)> _
    Public Shared Function HTMLCleaner(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
        ' chars.Length throws on a NULL SqlBytes, so NULL is passed straight through.
        If chars Is Nothing OrElse chars.IsNull Then
            Return SqlTypes.SqlBytes.Null
        End If

        Dim b As Byte
        Dim i As Integer
        Dim j As Integer = 0 ' next write position in cleaned()
        Dim inputLength As Integer = CInt(chars.Length)
        ' Every input byte yields at most one output byte, so the input length
        ' is a safe upper bound for the output buffer.
        Dim cleaned() As Byte = New Byte(inputLength - 1) {}
        Dim State As States = States.Space1
        Dim Substate As SubStates = SubStates.None
        ' Collects the characters of the entity or tag currently being scanned.
        Dim accum As New System.Text.StringBuilder()

        For i = 0 To inputLength - 1
            b = chars(i)
            Select Case State
                Case States.Norm
                    Select Case b
                        Case CharA To CharZ, Char_a To Char_z, CharDash
                            cleaned(j) = b
                            j += 1
                        Case CharSpace
                            cleaned(j) = b
                            j += 1
                            State = States.Space1
                        Case CharAmp
                            accum.Length = 0
                            State = States.Entity 'skip output
                        Case CharLT
                            cleaned(j) = CharSpace
                            j += 1
                            accum.Length = 0
                            State = States.HTMLTag
                        Case Else
                            'digits, punctuation, control bytes: skip output
                    End Select

                Case States.Space1
                    Select Case b
                        Case CharSpace
                            'discard leading & multiple spaces
                        Case CharAmp
                            accum.Length = 0
                            State = States.Entity 'skip output
                        Case CharLT
                            cleaned(j) = CharSpace
                            j += 1
                            accum.Length = 0
                            State = States.HTMLTag
                        Case CharDash
                            cleaned(j) = b
                            j += 1
                            State = States.Norm
                        Case CharA To CharZ, Char_a To Char_z
                            'first letter of a word; it may turn out to be a one-letter word
                            cleaned(j) = b
                            j += 1
                            State = States.Word1
                        Case Else
                            State = States.Norm 'skip output
                    End Select

                Case States.Word1
                    Select Case b
                        Case CharSpace
                            'single char word, retract it from the output
                            j -= 1
                            State = States.Space1
                        Case CharAmp
                            accum.Length = 0
                            State = States.Entity 'skip output
                        Case CharLT
                            cleaned(j) = CharSpace
                            j += 1
                            accum.Length = 0
                            State = States.HTMLTag
                        Case CharA To CharZ, Char_a To Char_z, CharDash
                            cleaned(j) = b
                            j += 1
                            State = States.Norm
                        Case Else
                            State = States.Norm 'skip output
                    End Select

                Case States.Entity
                    If b = CharSemiC Then
                        'End of entity, wrap it up
                        If String.Equals(accum.ToString(), "nbsp", StringComparison.Ordinal) Then
                            cleaned(j) = CharSpace
                            j += 1
                            State = States.Space1
                        Else
                            'any other entity is represented by a single "X"
                            cleaned(j) = CharX
                            j += 1
                            State = States.Norm
                        End If
                        'always reset, so the next entity or tag starts clean
                        accum.Length = 0
                    Else
                        'Append the character, not the byte's decimal text:
                        'appending the Byte itself would add "110" for "n".
                        accum.Append(Convert.ToChar(b))
                    End If

                Case States.HTMLTag
                    If b = CharGT Then
                        Dim openName As String = ExtractTagName(accum.ToString())
                        accum.Length = 0
                        If String.Equals(openName, "SCRIPT", StringComparison.OrdinalIgnoreCase) Then
                            State = States.Script
                            Substate = SubStates.None
                        ElseIf String.Equals(openName, "STYLE", StringComparison.OrdinalIgnoreCase) Then
                            State = States.Style
                            Substate = SubStates.None
                        Else
                            cleaned(j) = CharSpace
                            j += 1
                            State = States.Space1
                        End If
                    Else
                        'accumulate tag text (name and attributes)
                        accum.Append(Convert.ToChar(b))
                    End If

                Case States.Script, States.Style
                    'Both element kinds are skipped the same way; only the
                    'expected closing tag name differs.
                    Select Case Substate
                        Case SubStates.None
                            If b = CharLT Then
                                Substate = SubStates.EndBegin
                            End If
                        Case SubStates.EndBegin
                            If b = CharSlash Then
                                Substate = SubStates.EndSlash
                                accum.Length = 0
                            ElseIf b <> CharLT Then
                                'not a closing tag; a repeated "<" keeps us armed
                                Substate = SubStates.None
                            End If
                        Case SubStates.EndSlash
                            If b = CharGT Then
                                Dim closeName As String = ExtractTagName(accum.ToString())
                                Dim expected As String
                                If State = States.Script Then
                                    expected = "SCRIPT"
                                Else
                                    expected = "STYLE"
                                End If
                                accum.Length = 0
                                Substate = SubStates.None
                                If String.Equals(closeName, expected, StringComparison.OrdinalIgnoreCase) Then
                                    'End of element found; output nothing. The "<" of
                                    'the opening tag already emitted a space (or was
                                    'trimmed as leading), so continue as after a space.
                                    State = States.Space1
                                End If
                                'otherwise: false alarm, back to scanning
                            ElseIf b = CharLT Then
                                'a new "<" restarts the closing-tag scan, so a real
                                'closing tag after a false "</" is not swallowed
                                Substate = SubStates.EndBegin
                                accum.Length = 0
                            Else
                                'accumulate the end-tag label
                                accum.Append(Convert.ToChar(b))
                            End If
                    End Select

                Case Else
                    System.Diagnostics.Debug.Assert(False)
            End Select

            'extra check for multiple spaces
            If j > 1 _
                AndAlso cleaned(j - 1) = CharSpace _
                AndAlso cleaned(j - 2) = CharSpace Then
                j -= 1 'roll back the last character
            ElseIf j = 1 AndAlso cleaned(0) = CharSpace Then
                j = 0 'overwrite leading space
            End If
        Next

        'remove any trailing space
        If j > 0 AndAlso cleaned(j - 1) = CharSpace Then j -= 1

        'trim off the unused tail (j = 0 yields an empty array)
        ReDim Preserve cleaned(j - 1)
        Return New SqlBytes(cleaned)
    End Function

    ''' <summary>
    ''' Returns a byte-for-byte copy of the input, built with an explicit loop.
    ''' </summary>
    ''' <param name="chars">Input bytes; may be SQL NULL.</param>
    ''' <returns>SQL NULL for NULL input, otherwise a new SqlBytes over a fresh array.</returns>
    <Microsoft.SqlServer.Server.SqlFunction( _
        DataAccess:=DataAccessKind.None _
        , IsDeterministic:=True _
        , IsPrecise:=True)> _
    Public Shared Function HTMLCopy2(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
        ' chars.Length throws on a NULL SqlBytes, so NULL is passed straight through.
        If chars Is Nothing OrElse chars.IsNull Then
            Return SqlTypes.SqlBytes.Null
        End If

        Dim count As Integer = CInt(chars.Length)
        ' Read the Buffer property once instead of once per byte.
        Dim source() As Byte = chars.Buffer
        Dim copy() As Byte = New Byte(count - 1) {}
        For i As Integer = 0 To count - 1
            copy(i) = source(i)
        Next
        Return New SqlBytes(copy)
    End Function

    ''' <summary>
    ''' Returns a new SqlBytes constructed directly over the input's Buffer array.
    ''' </summary>
    ''' <param name="chars">Input bytes.</param>
    ''' <returns>A SqlBytes wrapping chars.Buffer; no bytes are copied by this code.</returns>
    <Microsoft.SqlServer.Server.SqlFunction( _
        DataAccess:=DataAccessKind.None _
        , IsDeterministic:=True _
        , IsPrecise:=True)> _
    Public Shared Function HTMLCopy(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
        ' NOTE(review): Buffer is the internal array and may be longer than
        ' chars.Length - confirm this is acceptable for the callers.
        Return New SqlTypes.SqlBytes(chars.Buffer)
    End Function
End Class