Warning: file_get_contents(/data/phpspider/zhask/data//catemap/5/sql/69.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
SQL文本过滤_Sql_Sql Server_Tsql - Fatal编程技术网

SQL文本过滤

SQL文本过滤,sql,sql-server,tsql,Sql,Sql Server,Tsql,我想从数据集中过滤出几个关键短语。不幸的是,到目前为止,我能想到的唯一算法是嵌套的replace语句,例如: SELECT REPLACE( REPLACE(FIELDNAME,'</u>','') ,'<u>','') 选择 替换( 替换(字段名,,“”) ,'','') 其中FIELDNAME是存储在表中的原始HTML代码。正如你所看到的,这是可怕的。有更好的想法吗?我认为在TSQL中没有更好的方法 如果您

我想从数据集中过滤出几个关键短语。不幸的是,到目前为止,我能想到的唯一算法是嵌套的replace语句,例如:

SELECT 
    REPLACE(
            REPLACE(FIELDNAME,'</u>','')
            ,'<u>','')
选择
替换(
替换(字段名,,“”)
,'','')

其中FIELDNAME是存储在表中的原始HTML代码。正如你所看到的,这是可怕的。有更好的想法吗?

我认为在TSQL中没有更好的方法


如果您在SQL层之上有另一个环境(例如asp.net),您可能会更幸运地在这方面进行过滤。

我认为TSQL中没有更好的方法


如果您在SQL层上有另一个环境(例如asp.net),您可能会更幸运地在其中进行过滤。

此类字符串操作最好由处理。

此类字符串操作最好由处理。

有一个讨论(原文链接已丢失)涉及类似但更复杂的问题:根据一组规则“清理”HTML 文本(您需要在该网站注册,但仅此而已)。讨论中包括了几种使用 T-SQL 的方法,以及一种使用 SQLCLR 的方法,后者是迄今为止最快的。由于 VB.Net/SQLCLR 解决方案是我编写的,我将它附在下面。

以下是它实现的文本替换/转换:

  • 删除脚本标记和内容
  • 用空格替换所有HTML标记
  • 将 &nbsp; 替换为空格
  • 将所有实体代码(“&xxx;”)替换为X
  • 将所有标点符号和数学符号( , ; : " & ( ) [ ] + / ≥ ≤ ° ÷ )替换为空格(连字符不替换);此外还包括百分号、反斜杠、下划线、脱字符(^)、星号、等号、花括号、问号、感叹号、竖线、美元符号、美分符号、井号、制表符和回车换行(CRLF)
  • 用空格替换所有数字
  • 用空格替换所有单字母单词(模式:空格单字符通配符空格)
  • 删除多余的空间
它以文本转换器(transducer)形式的 DFSA(确定性有限状态自动机)实现——严格来说只是“几乎”确定性的,因为它在几个地方会向前查看:

    导入系统
    导入系统数据
    导入System.Data.SqlClient
    导入System.Data.SqlTypes
    导入Microsoft.SqlServer.Server
    部分公共类UserDefinedFunctions
    公共枚举状态
    空间1
    实体
    HTMLTag
    标准
    字1
    剧本
    风格
    结束枚举
    枚举子状态
    没有一个
    结束开始
    尾端斜线
    结束枚举
    常量字符空间为整数=32
    常量字符作为整数=38
    常量字符斜杠为整数=47
    常量字符为整数=60
    常量CharGT为整数=62
    常量字符为整数=65
    常量字符为整数=88
    常量CharZ为整数=90
    常量字符a为整数=97
    常量字符b为整数=98
    常量字符作为整数=110
    常量字符p为整数=112
    常量字符作为整数=115
    常量Char_z为整数=122
    常量CharDash为整数=45
    常量CharSemiC为整数=59
    _
    公共共享函数HTMLCleaner(ByVal字符作为SqlTypes.SqlBytes)作为SqlTypes.SqlBytes
    作为字节的Dim b
    尺寸i为整数,j为整数
    变暗为字节()
    将状态变暗为状态=状态。空间1
    将子状态标注为子状态=子状态。无
    Dim strAccum As String=“”
    重拨出(0到字符长度-1)
    对于i=0到字符长度-1
    b=字符(i)
    选择案例状态
    案例说明。规范
    选择案例b
    案例CharA至CharZ,Char_a至Char_z
    Out(j)=b
    j=j+1
    状态=状态。规范
    格值空间
    Out(j)=b
    j=j+1
    State=States.Space1
    案例查拉姆
    状态=状态。实体“跳过输出”
    箱字符
    Out(j)=CharSpace
    j=j+1
    State=States.HTMLTag
    Case CharDash
    Out(j)=b
    j=j+1
    状态=状态。规范
    其他情况
    状态=状态。规范“跳过输出”
    结束选择
    案例2.1
    选择案例b
    格值空间
    “放弃前导空格和多个空格”
    案例查拉姆
    状态=状态。实体“跳过输出”
    箱字符
    Out(j)=CharSpace
    j=j+1
    State=States.HTMLTag
    Case CharDash
    Out(j)=b
    j=j+1
    状态=状态。规范
    案例CharA至CharZ,Char_a至Char_z
    Out(j)=b
    j=j+1
    State=States.Word1
    其他情况
    状态=状态。规范“跳过输出”
    结束选择
    案例1.1
    选择案例b
    格值空间
    '单字符字,从输出缩回:'
    j=j-1
    State=States.Space1
    案例查拉姆
    状态=状态。实体“跳过输出”
    箱字符
    Out(j)=CharSpace
    j=j+1
    State=States.HTMLTag
    Case CharDash
    
    Imports System
    Imports System.Data
    Imports System.Data.SqlClient
    Imports System.Data.SqlTypes
    Imports Microsoft.SqlServer.Server
    
    Partial Public Class UserDefinedFunctions
    
        ' Top-level states of the byte scanner implemented by HTMLCleaner.
        Public Enum States
            ' Initial state, and the state entered after a space has been emitted,
            ' after an "nbsp" entity, or after an ordinary tag is closed.
            ' Further spaces seen in this state are discarded.
            Space1
            ' Entered on "&"; bytes are accumulated until ";" is seen.
            Entity
            ' Entered on "<"; bytes are accumulated until ">" is seen.
            HTMLTag
            ' Ordinary text: letters and dashes are copied, other bytes are dropped.
            Norm
            ' The first letter of a word has just been copied while in Space1;
            ' if a space follows, that single letter is retracted from the output.
            Word1
            ' Entered when the accumulated tag text equals "SCRIPT";
            ' nothing is emitted until HTMLCleaner leaves this state.
            Script
            ' Entered when the accumulated tag text equals "STYLE";
            ' nothing is emitted until HTMLCleaner leaves this state.
            Style
        End Enum
        ' Sub-states used by HTMLCleaner while it is in States.Script or States.Style,
        ' tracking progress towards the matching end tag.
        Enum SubStates
            ' Not currently inside a candidate end tag.
            None
            ' The byte HTMLCleaner treats as the start of an end tag has been seen;
            ' the next byte must be "/" to continue, otherwise the scan falls back to None.
            EndBegin
            ' "/" has been seen; the end-tag label is being accumulated until ">".
            EndSlash
        End Enum
        ' Numeric character codes compared against the input bytes in HTMLCleaner.
        ' The values are the ASCII codes of the characters named in each comment.
        ' Char_b, Char_n, Char_p and Char_s are not referenced by the visible code.
        Const CharSpace As Integer = 32     ' " "
        Const CharAmp As Integer = 38       ' "&" - starts an entity
        Const CharSlash As Integer = 47     ' "/"
        Const CharLT As Integer = 60        ' "<" - starts a tag
        Const CharGT As Integer = 62        ' ">" - ends a tag
        Const CharA As Integer = 65         ' "A" - low end of the upper-case letter range
        Const CharX As Integer = 88         ' "X" - emitted in place of a non-nbsp entity
        Const CharZ As Integer = 90         ' "Z" - high end of the upper-case letter range
        Const Char_a As Integer = 97        ' "a" - low end of the lower-case letter range
        Const Char_b As Integer = 98        ' "b"
        Const Char_n As Integer = 110       ' "n"
        Const Char_p As Integer = 112       ' "p"
        Const Char_s As Integer = 115       ' "s"
        Const Char_z As Integer = 122       ' "z" - high end of the lower-case letter range
        Const CharDash As Integer = 45      ' "-" - copied to the output unchanged
        Const CharSemiC As Integer = 59     ' ";" - ends an entity
    
        ''' <summary>
        ''' Strips HTML markup from a byte string using a small state machine.
        ''' </summary>
        ''' <param name="chars">Single-byte-per-character HTML text; may be a SQL NULL.</param>
        ''' <returns>
        ''' The cleaned bytes, or SqlBytes.Null when <paramref name="chars"/> is NULL.
        ''' Transformations applied, in one pass:
        '''  - script and style elements are removed together with their content;
        '''  - every other tag is replaced by a space;
        '''  - the entity "nbsp" becomes a space, any other "&amp;...;" entity becomes "X";
        '''  - only letters A-Z / a-z and dashes are copied, every other byte is dropped;
        '''  - a single-letter word that is followed by a space is removed;
        '''  - runs of spaces are collapsed and leading/trailing spaces are trimmed.
        ''' </returns>
        ''' <remarks>
        ''' NOTE(review): an "&amp;" that is never followed by ";" keeps the scanner in the
        ''' Entity state for the rest of the input, and a "&gt;" inside an attribute value
        ''' ends the tag early. Both limitations are inherited from the original design.
        ''' </remarks>
        <Microsoft.SqlServer.Server.SqlFunction( _
                DataAccess:=DataAccessKind.None _
                , IsDeterministic:=True _
                , IsPrecise:=True)> _
        Public Shared Function HTMLCleaner(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
            ' NULL in, NULL out: reading Length of a NULL SqlBytes would raise instead.
            If chars Is Nothing OrElse chars.IsNull Then Return SqlBytes.Null

            Dim b As Byte
            Dim i As Integer, j As Integer      ' i = read position, j = write position
            Dim Out As Byte()
            Dim State As States = States.Space1
            Dim Substate As SubStates = SubStates.None
            Dim strAccum As String = ""         ' entity / tag name being collected
            Dim closingName As String

            ' The output can never be longer than the input: every input byte
            ' produces at most one output byte.
            ReDim Out(0 To chars.Length - 1)

            For i = 0 To chars.Length - 1
                b = chars(i)
                Select Case State

                    Case States.Norm
                        Select Case b
                            Case CharA To CharZ, Char_a To Char_z
                                Out(j) = b
                                j = j + 1
                                State = States.Norm
                            Case CharSpace
                                Out(j) = b
                                j = j + 1
                                State = States.Space1
                            Case CharAmp
                                State = States.Entity   ' skip output
                            Case CharLT
                                Out(j) = CharSpace
                                j = j + 1
                                State = States.HTMLTag
                            Case CharDash
                                Out(j) = b
                                j = j + 1
                                State = States.Norm
                            Case Else
                                State = States.Norm     ' skip output
                        End Select

                    Case States.Space1
                        Select Case b
                            Case CharSpace
                                ' discard leading and repeated spaces
                            Case CharAmp
                                State = States.Entity   ' skip output
                            Case CharLT
                                Out(j) = CharSpace
                                j = j + 1
                                State = States.HTMLTag
                            Case CharDash
                                Out(j) = b
                                j = j + 1
                                State = States.Norm
                            Case CharA To CharZ, Char_a To Char_z
                                Out(j) = b
                                j = j + 1
                                State = States.Word1
                            Case Else
                                State = States.Norm     ' skip output
                        End Select

                    Case States.Word1
                        Select Case b
                            Case CharSpace
                                ' single-character word: retract it from the output
                                j = j - 1
                                State = States.Space1
                            Case CharAmp
                                State = States.Entity   ' skip output
                            Case CharLT
                                Out(j) = CharSpace
                                j = j + 1
                                State = States.HTMLTag
                            Case CharDash
                                Out(j) = b
                                j = j + 1
                                State = States.Norm
                            Case CharA To CharZ, Char_a To Char_z
                                Out(j) = b
                                j = j + 1
                                State = States.Norm
                            Case Else
                                State = States.Norm     ' skip output
                        End Select

                    Case States.Entity
                        Select Case b
                            Case CharSemiC
                                ' End of entity, wrap it up.
                                If strAccum = "nbsp" Then
                                    Out(j) = CharSpace
                                    j = j + 1
                                    State = States.Space1
                                Else
                                    ' any other entity is represented by "X"
                                    Out(j) = CharX
                                    j = j + 1
                                    State = States.Norm
                                End If
                                ' Always clear the accumulator so the next entity or
                                ' tag name does not start with stale text.
                                strAccum = ""
                            Case Else
                                ' Keep scanning for the semicolon. ChrW is required:
                                ' "&" on a Byte would append its decimal text ("110"),
                                ' not the character ("n").
                                strAccum = strAccum & ChrW(b)
                        End Select

                    Case States.HTMLTag
                        If b = CharGT Then
                            If IsHtmlTagNamed(strAccum, "SCRIPT") Then
                                State = States.Script
                                Substate = SubStates.None
                            ElseIf IsHtmlTagNamed(strAccum, "STYLE") Then
                                State = States.Style
                                Substate = SubStates.None
                            Else
                                Out(j) = CharSpace
                                j = j + 1
                                State = States.Space1
                            End If
                            strAccum = ""
                        Else
                            ' accumulate the tag text as characters (see Entity above)
                            strAccum = strAccum & ChrW(b)
                        End If

                    Case States.Script, States.Style
                        ' Skip everything until the matching end tag "</NAME>".
                        Select Case Substate
                            Case SubStates.None
                                ' An end tag starts with "<".
                                If b = CharLT Then
                                    Substate = SubStates.EndBegin
                                End If
                            Case SubStates.EndBegin
                                If b = CharSlash Then
                                    Substate = SubStates.EndSlash
                                    strAccum = ""
                                ElseIf b <> CharLT Then
                                    ' Not an end tag after all; a repeated "<" keeps
                                    ' us waiting for the "/".
                                    Substate = SubStates.None
                                End If
                            Case SubStates.EndSlash
                                If b = CharGT Then
                                    If State = States.Script Then
                                        closingName = "SCRIPT"
                                    Else
                                        closingName = "STYLE"
                                    End If
                                    If IsHtmlTagNamed(strAccum, closingName) Then
                                        ' end of the element found; output nothing
                                        State = States.Norm
                                    End If
                                    ' Either way the candidate end tag is finished.
                                    Substate = SubStates.None
                                    strAccum = ""
                                Else
                                    ' accumulate the end tag's label
                                    strAccum = strAccum & ChrW(b)
                                End If
                        End Select

                    Case Else
                        Debug.Assert(1 = 0)
                End Select

                ' Extra check for multiple spaces.
                If j > 1 _
                    AndAlso Out(j - 1) = CharSpace _
                    AndAlso Out(j - 2) = CharSpace Then
                    j = j - 1   ' roll back the last character
                ElseIf j = 1 AndAlso Out(0) = CharSpace Then
                    j = 0       ' overwrite leading space
                End If

            Next

            ' remove any trailing space
            If j > 0 AndAlso Out(j - 1) = CharSpace Then j = j - 1
            ' trim off the unused tail (j = 0 yields an empty array)
            ReDim Preserve Out(0 To j - 1)

            Return New SqlBytes(Out)
        End Function

        ' Returns True when the first whitespace-delimited token of the text collected
        ' between "<" (or "</") and ">" equals expectedName, ignoring case. This lets
        ' "<script type=...>" and "<Script>" both be recognized as SCRIPT.
        Private Shared Function IsHtmlTagNamed(ByVal tagText As String, ByVal expectedName As String) As Boolean
            Dim name As String = tagText.Trim()
            Dim cut As Integer = name.IndexOfAny(New Char() {" "c, ControlChars.Tab, ControlChars.Cr, ControlChars.Lf})
            If cut >= 0 Then name = name.Substring(0, cut)
            Return String.Equals(name, expectedName, StringComparison.OrdinalIgnoreCase)
        End Function
    
        ''' <summary>
        ''' Returns a byte-for-byte copy of the input, built by reading the input's
        ''' Buffer one element at a time into a freshly allocated array.
        ''' </summary>
        ''' <param name="chars">The bytes to copy.</param>
        ''' <returns>A new SqlBytes wrapping the copied array.</returns>
        <Microsoft.SqlServer.Server.SqlFunction( _
                DataAccess:=DataAccessKind.None _
                , IsDeterministic:=True _
                , IsPrecise:=True)> _
        Public Shared Function HTMLCopy2(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
            Dim total As Long = chars.Length
            Dim copy() As Byte
            ReDim copy(0 To total - 1)

            ' Element-wise copy; Buffer is read on every iteration, as before.
            Dim pos As Integer = 0
            Do While pos <= total - 1
                copy(pos) = chars.Buffer(pos)
                pos = pos + 1
            Loop

            Return New SqlBytes(copy)
        End Function
    
        ''' <summary>
        ''' Returns a new SqlBytes constructed directly from the input's Buffer;
        ''' this function itself does not copy any bytes.
        ''' </summary>
        ''' <param name="chars">The bytes to wrap.</param>
        ''' <returns>A new SqlBytes built from <c>chars.Buffer</c>.</returns>
        <Microsoft.SqlServer.Server.SqlFunction( _
                DataAccess:=DataAccessKind.None _
                , IsDeterministic:=True _
                , IsPrecise:=True)> _
        Public Shared Function HTMLCopy(ByVal chars As SqlTypes.SqlBytes) As SqlTypes.SqlBytes
            Dim underlying As Byte() = chars.Buffer
            Dim wrapped As New SqlTypes.SqlBytes(underlying)
            Return wrapped
        End Function
    
    End Class