使用join替换varchar(max)字段中的值

时间:2013-07-02 22:07:36

标签: sql sql-server sql-server-2008-r2

我有一个包含带占位符的文本字段的表。像这样:

Row Notes  
1.  This is some notes ##placeholder130## this ##myPlaceholder##, #oneMore#. End.
2.  Second row...just a ##test#.   

(此表平均包含大约1-5k行。一行中占位符的平均数为5-15)。

现在,我有一个如下所示的查找表:

Name             Value
placeholder130    Dog
myPlaceholder     Cat
oneMore           Cow
test              Horse   

(查找表将包含10k到100k记录中的任何内容)

我需要找到将这些占位符从字符串连接到查找表并用值替换的最快方法。所以,我的结果应该是这样的(第1行):

  

This is some notes Dog this Cat, Cow. End.

我想到的办法是:按占位符把每一行拆分成多行,然后与查找表连接,再把这些记录重新拼接回原来的行,但这平均需要大约10-30秒。

7 个答案:

答案 0 :(得分:9)

您可以尝试使用数字表拆分字符串,然后使用for xml path重建字符串。

-- Rebuilds each note: the note is cut into fragments at every '##', each fragment
-- that equals a Lookup.Name is swapped for Lookup.Value, and the fragments are
-- concatenated back together in position order with FOR XML PATH('').
select (
         select coalesce(lk.Value, frag.Value)
         from Numbers as num
           -- fragment = text starting at this position up to the next '##'
           -- (the appended '##' guarantees a terminator for the last fragment)
           cross apply (
                         select substring(
                                  Notes.notes,
                                  num.Number,
                                  charindex('##', Notes.notes + '##', num.Number) - num.Number
                                )
                       ) as frag(Value)
           left outer join Lookup as lk
             on lk.Name = frag.Value
         -- keep only positions that begin a fragment: those immediately preceded
         -- by '##' (the prepended '##' makes position 1 qualify as well)
         where num.Number <= len(Notes.notes)
           and substring('##' + Notes.notes, num.Number, 2) = '##'
         order by num.Number
         for xml path(''), type
       ).value('text()[1]', 'varchar(max)')
from Notes

SQL Fiddle

字符串拆分的写法借用自 this blog post by Aaron Bertrand。

答案 1 :(得分:6)

SQL Server的字符串操作速度不是很快,所以这件事可能最好放在客户端完成。让客户端加载整个查找表,并在笔记到达时完成替换。

话虽如此,它当然可以在SQL中完成。这是一个带有递归CTE的解决方案。它每个递归步骤执行一次查找:

-- Applies every Lookup row to every note, one replacement per recursion level,
-- and returns the notes as they stand after the last lookup row has been applied.
; with  NumberedLookup as
        (
        -- number the lookup rows so the recursion can step through them one by one
        select  row_number() over (order by lk.name) as rn
        ,       lk.Name
        ,       lk.Value
        from    Lookup as lk
        )
,       Applied as
        (
        -- level 0: the untouched notes
        select  Notes
        ,       0 as rn
        from    Notes
        union all
        -- level k: the previous level with lookup row k substituted
        select  replace(prev.Notes, '##' + nl.name + '##', nl.value)
        ,       prev.rn + 1
        from    Applied as prev
        join    NumberedLookup as nl
        on      nl.rn = prev.rn + 1
        )
select  done.Notes
,       done.rn
from    Applied as done
-- only the final level has had every lookup row applied
where   done.rn = (select count(*) from Lookup)
-- one recursion level per lookup row, so lift the default recursion cap
option  (maxrecursion 0)

Example at SQL Fiddle.

另一个选项是while循环来继续替换查找,直到找不到更多:

-- Copies the notes into a table variable and keeps substituting ##name## tokens
-- until no note contains a token that exists in Lookup; then returns the result.
declare @notes table (notes varchar(max))

insert  @notes (notes)
select  Notes
from    Notes

while 1=1
    begin

    -- Each pass substitutes one lookup entry per note that still contains a token.
    update  n
    set     Notes = replace(n.Notes, '##' + l.name + '##', l.value)
    from    @notes n
    outer apply
            (
            select  top 1 Name
            ,       Value
            from    Lookup l
            -- Literal search on purpose: LIKE would treat %, _ and [ inside a
            -- lookup name as wildcards, could match text that REPLACE (which is
            -- literal) then leaves unchanged, and the loop would never reach
            -- @@rowcount = 0.
            where   charindex('##' + l.name + '##', n.Notes) > 0
            ) l
    where   l.name is not null

    -- NOTE(review): a lookup value that itself contains its own ##name## token
    -- would still be re-matched on every pass; confirm the data rules this out.
    if @@rowcount = 0
        break
    end

select  *
from    @notes

Example at SQL Fiddle.

答案 2 :(得分:4)

试试这个

-- Recursive CTE that substitutes one ##name## token per recursion level.
--   org     : the original note text (used to group the levels of one note)
--   calc    : the note text as it was before this level's replacement
--   [Notes] : the note text after this level's replacement
--   [level] : number of replacements applied so far
;WITH CTE (org, calc, [Notes], [level]) AS
(
    SELECT [Notes], [Notes], CONVERT(varchar(MAX),[Notes]), 0 FROM PlaceholderTable

    UNION ALL

    SELECT  CTE.org, CTE.[Notes],
        CONVERT(varchar(MAX), REPLACE(CTE.[Notes],'##' + T.[Name] + '##', T.[Value])), CTE.[level] + 1
    FROM    CTE
    -- Literal match via CHARINDEX: LIKE would interpret %, _ and [ inside a name
    -- as wildcards and could join rows where REPLACE changes nothing, so the
    -- recursion would keep going until MAXRECURSION aborts the query.
    INNER JOIN LookupTable T ON CHARINDEX('##' + T.[Name] + '##', CTE.[Notes]) > 0

)

-- Keep only the deepest level reached for each original note; DISTINCT collapses
-- the identical rows produced by different replacement orders.
SELECT DISTINCT org, [Notes], [level] FROM CTE
WHERE [level] = (SELECT MAX(c.[level]) FROM CTE c WHERE CTE.org = c.org)

SQL FIDDLE DEMO

查看以下devioblog帖子以供参考

devioblog post

答案 3 :(得分:4)

我同意评论中的看法:T-SQL并不适合这种操作。但是如果您必须在数据库中完成,下面是一个用函数来管理多个replace语句的例子。

由于每条笔记中的令牌数量相对较少(5-15),而查找表中的令牌数量非常大(10k-100k),因此我的函数首先从输入中提取出 potential(潜在的)令牌,并用这个集合去连接您的查找表(下面的dbo.[Lookup])。在每条笔记中逐一查找所有令牌的工作量太大了。

我使用50k个令牌和5k条笔记做了一些性能测试,这个函数运行得非常好,完成时间<2秒(在我的笔记本电脑上)。请反馈一下这个策略在您那里的执行效果。

注意:在您的示例数据中,令牌格式不一致(##_#, ##_##, #_#),我猜这只是拼写错误,并假设所有令牌都采用 ##TokenName## 的形式。

--setup: drop any previous lookup table / function, then recreate and seed the lookup table
    if object_id('dbo.[Lookup]') is not null
        drop table dbo.[Lookup];
    go
    if object_id('dbo.fn_ReplaceLookups') is not null
        drop function dbo.fn_ReplaceLookups;
    go

    -- lookup names are stored together with their surrounding ## delimiters
    create table dbo.[Lookup] (
        LookupName  varchar(100) primary key,
        LookupValue varchar(100)
    );
    insert into dbo.[Lookup] (LookupName, LookupValue)
    values  ('##placeholder130##', 'Dog'),
            ('##myPlaceholder##',  'Cat'),
            ('##oneMore##',        'Cow'),
            ('##test##',           'Horse');
    go

    -- Replaces every ##token## in @input whose delimited form exists in
    -- dbo.[Lookup].LookupName with that row's LookupValue.
    --   @input  : text that may contain ##token## placeholders
    --   returns : @input with all known placeholders substituted
    -- Only tokens that actually occur in @input are joined to the lookup table,
    -- so the large lookup table is never scanned once per note.
    create function [dbo].[fn_ReplaceLookups](@input varchar(max))
    returns varchar(max)
    as
    begin

        declare @xml xml;
        -- Entity-encode the text first (FOR XML PATH('') escapes &, < and >) so
        -- notes containing markup characters still cast to xml; .value() below
        -- decodes the entities again.
        select @xml = cast(('<r><i>'
                            + replace((select @input as [text()] for xml path('')), '##', '</i><i>')
                            + '</i></r>') as xml);

        --extract the potential tokens: splitting on '##' puts them in the even-numbered segments
        declare @LookupsInString table (LookupName varchar(100) primary key);
        insert into @LookupsInString (LookupName)
            select  distinct '##'+v+'##'
            from    (   -- varchar(96): with the 4 delimiter characters added back the
                        -- candidate still fits the varchar(100) key column
                        select  [v] = r.n.value('(./text())[1]', 'varchar(96)'),
                                [r] = row_number() over (order by n)
                        from    @xml.nodes('r/i') r(n)
                    )d(v,r)
            where   r%2=0
                    -- an empty segment (e.g. '####') has no text node and yields NULL,
                    -- which the primary key would reject
                    and v is not null;

        --substitute every candidate that really exists in the lookup table
        select  @input = replace(@input, l.LookupName, l.LookupValue)
        from    dbo.[Lookup] l
        join    @LookupsInString lis on
                l.LookupName = lis.LookupName;

        return @input;
    end
    go
    -- NOTE(review): no batch separator follows this RETURN, so it shares a batch
    -- with the usage statements below and they are skipped when the script is run
    -- top to bottom; presumably the usage part is meant to be run on its own.
    return

--usage: two sample notes passed through dbo.fn_ReplaceLookups
    declare @Notes table ([Id] int primary key, notes varchar(100));
    insert into @Notes ([Id], notes)
    values  (1, 'This is some notes ##placeholder130## this ##myPlaceholder##, ##oneMore##. End.'),
            (2, 'Second row...just a ##test##.');

    select  n.[Id],
            n.notes,
            dbo.fn_ReplaceLookups(n.notes)
    from    @Notes as n;

返回:

Tokenized
--------------------------------------------------------
This is some notes Dog this Cat, Cow. End.
Second row...just a Horse.

答案 4 :(得分:1)

为了提高速度,您可以将注释模板预处理为更有效的形式。这将是一系列片段,每个片段以替换结束。对于最后一个片段,替换可能为NULL。

Notes
Id     FragSeq    Text                    SubsId
1      1          'This is some notes '   1
1      2          ' this '                2
1      3          ', '                    3
1      4          '. End.'                null
2      1          'Second row...just a '  4
2      2          '.'                     null

Subs
Id  Name               Value
1   'placeholder130'   'Dog'
2   'myPlaceholder'    'Cat'
3   'oneMore'          'Cow'
4   'test'             'Horse'  

现在我们可以通过简单的连接进行替换。

-- One row per fragment of the requested note: the literal text followed by its
-- substitution value (empty when the fragment has no substitution).
SELECT n.Text + COALESCE(s.Value, '')
FROM Notes AS n
LEFT JOIN Subs AS s
    ON n.SubsId = s.Id
WHERE n.Id = ?
ORDER BY n.FragSeq

这将生成一个已完成替换的片段列表。我不是MSSQL用户,但在大多数SQL方言中,您可以很容易地把这些片段拼接到一个变量中:

-- Concatenates the substituted fragments of one note into a single variable.
-- VARCHAR(MAX) instead of VARCHAR(8000) so long notes are not cut off.
-- NOTE(review): SQL Server does not guarantee the row order of this
-- variable-concatenation pattern; verify the result respects FragSeq, or build
-- the string with FOR XML PATH('') instead.
DECLARE @Note VARCHAR(MAX)
SELECT @Note = COALESCE(@Note, '') + Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY FragSeq

使用其他帖子的字符串拆分技术,将笔记模板预处理成片段将非常简单。

不幸的是,我不在我可以测试它的位置,但它应该可以正常工作。

答案 5 :(得分:0)

我真的不知道它在10k+条查找记录下的表现如何。试试老办法——动态SQL——怎么样?

-- Builds one nested REPLACE(REPLACE(... [Notes] ...)) expression with a level for
-- every lookup row, then runs it against PlaceholderTable as dynamic SQL.
DECLARE @sqlCommand  NVARCHAR(MAX)
SELECT @sqlCommand  = N'PlaceholderTable.[Notes]'

-- Names and values are embedded as string literals in the generated statement,
-- so every single quote in the data is doubled; an unescaped quote would end the
-- literal early and break (or hijack) the statement.
SELECT @sqlCommand  = 'REPLACE( ' + @sqlCommand  +
                      ', ''##' + REPLACE(LookupTable.[Name], '''', '''''') + '##'', ''' +
                      REPLACE(LookupTable.[Value], '''', '''''') + ''')'
FROM LookupTable
-- a NULL name or value would turn the whole command into NULL
WHERE LookupTable.[Name] IS NOT NULL
  AND LookupTable.[Value] IS NOT NULL

SELECT @sqlCommand  = 'SELECT *, ' + @sqlCommand  + ' FROM PlaceholderTable'

EXECUTE sp_executesql @sqlCommand

Fiddle demo

答案 6 :(得分:0)

现在进行一些递归CTE。

如果您的索引设置正确,那么这个方案要么非常快,要么非常慢。说到递归CTE(r-CTE),SQL Server的极端性能表现总是让我惊讶……

-- Recursive CTE that walks each note from left to right, one '##'-delimited piece
-- per recursion level, appending to Result either the piece itself or, when the
-- piece matches LookupTable.Name, the corresponding LookupTable.Value.
-- The final query keeps, per Row, the result of the last recursion level.
-- NOTE(review): if Notes contains no '##', patindex returns 0 in the anchor and
-- the substring length below becomes -1 — confirm how such rows should be handled.
;WITH T AS (
  -- anchor: the text in front of the first '##'
  SELECT
    Row,
    StartIdx = 1,                                  -- 1 as first starting index
    EndIdx = CAST(patindex('%##%', Notes) as int), -- first ending index (position of the first '##')
    Result = substring(Notes, 1, patindex('%##%', Notes) - 1)
                                                   -- (first) temp result bounded by indexes
  FROM PlaceholderTable -- **this is your source table**
  UNION ALL
  -- recursive step: take the next piece after the previous '##' and append it
  SELECT
    pt.Row,
    StartIdx = newstartidx,                        -- starting index (calculated in calc1)
    EndIdx = EndIdx + CAST(newendidx as int) + 1,  -- ending index (calculated in calc4 + total offset)
    Result = Result + CAST(ISNULL(newtokensub, newtoken) as nvarchar(max))
                                                   -- temp result taken from subquery or original
  FROM 
    T
    JOIN PlaceholderTable pt -- **this is your source table**
      ON pt.Row = T.Row
    CROSS APPLY(
      SELECT newstartidx = EndIdx + 2              -- new starting index moved by 2 from last end ('##')
    ) calc1
    CROSS APPLY(
      SELECT newtxt = substring(pt.Notes, newstartidx, len(pt.Notes))
                                                   -- current piece of txt we work on (rest of the note)
    ) calc2
    CROSS APPLY(
      SELECT patidx = patindex('%##%', newtxt)     -- current index of '##' (0 when none is left)
    ) calc3
    CROSS APPLY(
      SELECT newendidx = CASE 
        WHEN patidx = 0 THEN len(newtxt) + 1
        ELSE patidx END                            -- if last piece of txt, end with its length
    ) calc4
    CROSS APPLY(
      SELECT newtoken = substring(pt.Notes, newstartidx, newendidx - 1)
                                                   -- get the new token
    ) calc5
    OUTER APPLY(
      SELECT newtokensub = Value
      FROM LookupTable
      WHERE Name = newtoken                        -- substitute the token if you can find it in **your lookup table**
    ) calc6
  WHERE newstartidx + len(newtxt) - 1  <= len(pt.Notes)  
                                                   -- recursion continues only while this holds; it is meant to stop once the start index has moved past the end of Notes
) 
,lastProcessed AS (
  SELECT 
    Row, 
    Result,
    rn = row_number() over(partition by Row order by StartIdx desc)
                                                   -- rn = 1 marks the row with the highest StartIdx for each Row
  FROM T 
)                                                  -- enumerate all (including intermediate) results
SELECT *
FROM lastProcessed
WHERE rn = 1                                       -- filter out intermediate results (display only last ones)