c# – SQL Server匹配单词短语和顺序相关性的最佳方法
|
通过参数中单词的数量(计数)/匹配对sql varchar列进行排名的最佳方法是什么,具有四个不同的唯一标准.这可能不是一个微不足道的问题,但我挑战根据“最佳匹配”使用我的标准来排序行.
专栏:description varchar(100) 使用此订单首选项输出: >完全匹配(整个字符串匹配) – 始终是第一个 单词可能不完全匹配,因为单词的部分匹配是允许的,并且可能,lessor值应该应用于部分单词以进行排名但不是关键的(pot将匹配每个:pot,potter,potholder,depot,depotting).与其他单词匹配开始应该排名高于没有后续匹配的那些,但这不是交易杀手/超级重要. 我想有一个方法来排列列“以”开头“参数中的值的位置.说我有以下字符串: 'This is my value string as a test template to rank on.' 在第一种情况下,我希望得到最大字数存在的列/行的等级. 并且第二个基于出现时的出现(最佳匹配)排名为: 'This is my string as a test template to rank on.' - first 'This is my string as a test template to rank on even though not exact.'-second 'This is my string as a test template to rank' - third 'This is my string as a test template to' - next 'This is my string as a test template' - next etc. 其次:(可能是第一组(开始)之后的第二组/数据组 – 这是期望的 我想按@MyParameter中出现的@MyParameter中的单词计数排序(排序)行,其中连续单词的排名高于相同的单独计数. 因此,对于上面的示例字符串,’我所显示的字符串’将排名高于’不是我的其他字符串’,因为连续字符串(单词在一起)的“更好匹配”具有相同的字数.具有较高匹配(发生的单词数)的行将首先排名降序最佳匹配. 如果可能的话,我想在一个查询中执行此操作. 结果中不应出现两次行. 出于性能考虑,表中不会有超过10,000行. 表中的值是相当静态的,几乎没有变化,但并非完全如此. 我此时无法更改结构,但稍后会考虑(如单词/短语表) 为了使这个稍微复杂一点,单词列表在两个表中 – 但是我可以为它创建一个视图,但是一个表结果(较小的列表)应该在第二个之前发生,更大的数据集结果给出相同的匹配 – 将会从这些表以及表中重复,我只想要不同的值.选择DISTINCT并不容易,因为我想返回一个列(sourceTable),它很可能使行不同,在这种情况下只从第一个(较小的)表中选择,但是所有其他列DISTINCT都是需要的(不要考虑“独特”评估中的专栏. 表中的Psuedo列: procedureCode VARCHAR(50),description VARCHAR(100),-- this is the sort/evaluation column category VARCHAR(50),relvu VARCHAR(50),charge VARCHAR(15),active bit sourceTable VARCHAR(50) - just shows which table it comes from of the two 没有像ID列那样存在唯一索引 匹配不在第三个表中排除SELECT * FROM(select * from tableone,其中procedureCode不在(从tablethree中选择procedureCode)) 编辑:为了解决这个问题,我创建了一个像这样的表值参数: 0 Gastric Intubation & Aspiration/Lavage,Treatmen 1 Gastric%Intubation%Aspiration%Lavage%Treatmen 2 Gastric%Intubation%Aspiration%Lavage 3 Gastric%Intubation%Aspiration 4 Gastric%Intubation 5 Gastric 6 Intubation%Aspiration%Lavage%Treatmen 7 Intubation%Aspiration%Lavage 8 Intubation%Aspiration 9 Intubation 10 Aspiration%Lavage%Treatmen 11 Aspiration%Lavage 12 Aspiration 13 Lavage%Treatmen 14 Lavage 15 Treatmen 实际短语在第0行的位置 这是我目前的尝试: CREATE PROCEDURE [GetProcedureByDescription]
(
@IncludeMaster BIT,@ProcedureSearchPhrases CPTFavorite READONLY
)
AS
DECLARE @myIncludeMaster BIT;
SET @myIncludeMaster = @IncludeMaster;
CREATE TABLE #DistinctMatchingCpts
(
procedureCode VARCHAR(50),description VARCHAR(100),category VARCHAR(50),rvu VARCHAR(50),charge VARCHAR(15),active VARCHAR(15),sourceTable VARCHAR(50),sequenceSet VARCHAR(2)
)
IF @myIncludeMaster = 0
BEGIN -- Excluding master from search
INSERT INTO #DistinctMatchingCpts (sourceTable,procedureCode,description,category,charge,active,rvu,sequenceSet
)
SELECT DISTINCT sourceTable,sequenceSet
FROM (
SELECT TOP 1
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,LTRIM(RTRIM(CPT.[LEVEL])) AS description,LTRIM(RTRIM(CPT.[COMBO])) AS category,LTRIM(RTRIM(CPT.[CHARGE])) AS charge,''True'' AS active,LTRIM(RTRIM([RVU])) AS rvu,''0CPTMore'' AS sourceTable,''01'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] = PP.[LEVEL]
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'',''MOD'',''CATEGORY'',''Types'',''Bundles''))
AND CPT.[CODE] IS NOT NULL
AND CPT.[CODE] NOT IN (''0'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
ORDER BY PP.CODE
UNION ALL
SELECT
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,LTRIM(RTRIM([CHARGE])) AS charge,''02'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] LIKE PP.[LEVEL] + ''%''
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
UNION ALL
SELECT
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,''03'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] LIKE ''%'' + PP.[LEVEL] + ''%''
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
) AS CPTS
ORDER BY
procedureCode,sourceTable,[description]
END -- Excluded master from search
ELSE
BEGIN -- Including master in search,but present favorites before master for each code
-- Get matching procedures,ordered by code,source (favorites first),and description.
-- There probably will be procedures with duplicated code+description,so we will filter
-- duplicates shortly.
INSERT INTO #DistinctMatchingCpts (sourceTable,sequenceSet)
SELECT DISTINCT sourceTable,''00'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] = PP.[LEVEL]
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
ORDER BY PP.CODE
UNION ALL
SELECT TOP 1
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,LTRIM(RTRIM(CPT.[CATEGORY])) AS category,COALESCE(CASE [ACTIVE] WHEN 1 THEN ''True'' WHEN 0 THEN ''False'' WHEN '''' THEN ''False'' ELSE ''False'' END,''True'') AS active,''2MasterCPT'' AS sourceTable,''00'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [MASTERCPT] AS CPT
ON CPT.[LEVEL] = PP.[LEVEL]
WHERE
CPT.[CODE] IS NOT NULL
AND CPT.[CODE] NOT IN (''0'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
ORDER BY PP.CODE
UNION ALL
SELECT
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,''01'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] = PP.[LEVEL]
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
UNION ALL
SELECT
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,''01'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [MASTERCPT] AS CPT
ON CPT.[LEVEL] = PP.[LEVEL]
WHERE
CPT.[CODE] IS NOT NULL
AND CPT.[CODE] NOT IN (''0'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
UNION ALL
SELECT TOP 1
LTRIM(RTRIM(CPT.[CODE])) AS procedureCode,''02'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] LIKE PP.[LEVEL] + ''%''
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'',''02'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [MASTERCPT] AS CPT
ON CPT.[LEVEL] LIKE PP.[LEVEL] + ''%''
WHERE
CPT.[CODE] IS NOT NULL
AND CPT.[CODE] NOT IN (''0'',''03'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] LIKE PP.[LEVEL] + ''%''
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'',''03'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [MASTERCPT] AS CPT
ON CPT.[LEVEL] LIKE PP.[LEVEL] + ''%''
WHERE
CPT.[CODE] IS NOT NULL
AND CPT.[CODE] NOT IN (''0'',''04'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [CPTMORE] AS CPT
ON CPT.[LEVEL] LIKE ''%'' + PP.[LEVEL] + ''%''
WHERE
(CPT.[COMBO] IS NULL OR CPT.[COMBO] NOT IN (''Editor'',''04'' AS sequenceSet
FROM
@ProcedureSearchPhrases PP
INNER JOIN [MASTERCPT] AS CPT
ON CPT.[LEVEL] LIKE ''%'' + PP.[LEVEL] + ''%''
WHERE
CPT.[CODE] IS NOT NULL
AND CPT.[CODE] NOT IN (''0'','''')
AND CPT.[CODE] NOT IN (SELECT CPTE.[CODE] FROM CPT AS CPTE WHERE CPTE.[CODE] IS NOT NULL)
) AS CPTS
ORDER BY
sequenceSet,[description]
END
/* Final select - uses artificial ordering from the insertion ORDER BY */
SELECT procedureCode,active FROM
(
SELECT TOP 500 *-- procedureCode,active
FROM #DistinctMatchingCpts
ORDER BY sequenceSet,description
) AS CPTROWS
DROP TABLE #DistinctMatchingCpts
但是,这不符合单词计数的最佳匹配标准(如样本中的第1行值),该标准应与从该行中找到的最佳(大多数)单词匹配. 如果这会产生影响,我可以完全控制表值参数的格式/格式. 我将此结果返回给c#程序,如果它有用的话. 解决方法
您需要能够拆分字符串来解决此问题.
I prefer the number table approach to split a string in TSQL
为了我的下面的代码工作(以及我的分割功能),你需要这样做一次表设置: SELECT TOP 10000 IDENTITY(int,1,1) AS Number
INTO Numbers
FROM sys.objects s1
CROSS JOIN sys.objects s2
ALTER TABLE Numbers ADD CONSTRAINT PK_Numbers PRIMARY KEY CLUSTERED (Number)
设置Numbers表后,创建此拆分功能: CREATE FUNCTION [dbo].[FN_ListToTable]
(
@SplitOn char(1) --REQUIRED,the character to split the @List string on,@List varchar(8000)--REQUIRED,the list to split apart
)
RETURNS TABLE
AS
RETURN
(
----------------
--SINGLE QUERY-- --this will not return empty rows
----------------
SELECT
ListValue
FROM (SELECT
LTRIM(RTRIM(SUBSTRING(List2,number+1,CHARINDEX(@SplitOn,List2,number+1)-number - 1))) AS ListValue
FROM (
SELECT @SplitOn + @List + @SplitOn AS List2
) AS dt
INNER JOIN Numbers n ON n.Number < LEN(dt.List2)
WHERE SUBSTRING(List2,number,1) = @SplitOn
) dt2
WHERE ListValue IS NOT NULL AND ListValue!=''
);
GO
您可以随意创建自己的分割功能,但仍需要Numbers表才能使我的解决方案正常工作. 您现在可以轻松地将CSV字符串拆分为表格并加入其中: select * from dbo.FN_ListToTable(',','1,2,3,4,5,6777,')
OUTPUT: ListValue ----------------------- 1 2 3 4 5 6777 (6 row(s) affected) 现在尝试这个: DECLARE @BaseTable table (RowID int primary key,RowValue varchar(100))
set nocount on
INSERT @BaseTable VALUES ( 1,'The cows came home empty handed')
INSERT @BaseTable VALUES ( 2,'This is my string as a test template to rank') -- third
INSERT @BaseTable VALUES ( 3,'pencil pen paperclip eraser')
INSERT @BaseTable VALUES ( 4,'wow')
INSERT @BaseTable VALUES ( 5,'no dice here')
INSERT @BaseTable VALUES ( 6,'This is my string as a test template to rank on even though not exact.') -- second
INSERT @BaseTable VALUES ( 7,'apple banana pear grape lemon orange kiwi strawberry peach watermellon')
INSERT @BaseTable VALUES ( 8,'This is my string as a test template') -- 5th
INSERT @BaseTable VALUES ( 9,'rat cat bat mat sat fat hat pat ')
INSERT @BaseTable VALUES (10,'house home pool roll')
INSERT @BaseTable VALUES (11,'This is my string as a test template to') -- 4th
INSERT @BaseTable VALUES (12,'talk wisper yell scream sing hum')
INSERT @BaseTable VALUES (13,'This is my string as a test template to rank on.') -- first
INSERT @BaseTable VALUES (14,'aaa bbb ccc ddd eee fff ggg hhh')
INSERT @BaseTable VALUES (15,'three twice three once twice three')
set nocount off
DECLARE @SearchValue varchar(100)
SET @SearchValue='This is my value string as a test template to rank on.'
;WITH SplitBaseTable AS --expand each @BaseTable row into one row per word
(SELECT
b.RowID,b.RowValue,s.ListValue
FROM @BaseTable b
CROSS APPLY dbo.FN_ListToTable(' ',b.RowValue) AS s
),WordMatchCount AS --for each @BaseTable row that has has a word in common withe the search string,get the count of matching words
(SELECT
s.RowID,COUNT(*) AS CountOfWordMatch
FROM dbo.FN_ListToTable(' ',@SearchValue) v
INNER JOIN SplitBaseTable s ON v.ListValue=s.ListValue
GROUP BY s.RowID
HAVING COUNT(*)>0
),SearchLen AS --get one row for each possible length of the search string
(
SELECT
n.Number,SUBSTRING(@SearchValue,n.Number) AS PartialSearchValue
FROM Numbers n
WHERE n.Number<=LEN(@SearchValue)
),MatchLen AS --for each @BaseTable row,get the max starting length that matches the search string
(
SELECT
b.RowID,MAX(l.Number) MatchStartLen
FROM @BaseTable b
LEFT OUTER JOIN SearchLen l ON LEFT(b.RowValue,l.Number)=l.PartialSearchValue
GROUP BY b.RowID
)
SELECT --return the final search results
b.RowValue,w.CountOfWordMatch,m.MatchStartLen
FROM @BaseTable b
LEFT OUTER JOIN WordMatchCount w ON b.RowID=w.RowID
LEFT OUTER JOIN MatchLen m ON b.RowID=m.RowID
WHERE w.CountOfWordMatch>0
ORDER BY w.CountOfWordMatch DESC,m.MatchStartLen DESC,LEN(b.RowValue) DESC,b.RowValue ASC
OUTPUT: RowValue CountOfWordMatch MatchStartLen ----------------------------------------------------------------------- ---------------- ------------- This is my string as a test template to rank on. 11 11 This is my string as a test template to rank on even though not exact. 10 11 This is my string as a test template to rank 10 11 This is my string as a test template to 9 11 This is my string as a test template 8 11 (5 row(s) affected) 它确实字符串单词的开头匹配有点不同,因为它查看匹配字符串开头的字符数. 一旦你开始工作,你可以尝试通过为SplitBaseTable创建一些静态索引表来优化它.可能在@BaseTable上使用触发器. (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |
