CREATE
FUNCTION
dbo.fnGetNGramValidChars()
RETURNS
nvarchar(39)
AS
BEGIN
-- Approved Signs for N-Gram Index
-- Western capital letters + Space
-- Digits
-- Delimiter sign: !
-- Special character sign: #
RETURN
N
'ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789!#'
;
END
dbo.fnNormalizeDatastring
(@string nvarchar(
max
))
nvarchar(
)
-- Get all valid characters
DECLARE
@validChars nvarchar(50) = (
SELECT
dbo.dbo.fnGetNGramValidChars()
Result);
-- Only capital characters
SET
@string =
UPPER
(@string);
-- Replace umlauts
REPLACE
(@string, N
'Ä'
, N
'AE'
);
'Æ'
-- usw. ...
-- Registered Marks
'℅'
'C#O'
'™'
'TM'
-- Replace delimiter by !
'.'
'!'
','
-- Replace special characeters by hash mark.
'+'
'#'
'-'
-- Replace all other invalid characters by space.
@loop
int
= 1;
@len
= LEN(@string);
@invalidChar
nchar
(1);
WHILE @loop <= @len
@invalidChar =
SUBSTRING
(@string, @loop, 1);
IF CHARINDEX(@invalidChar, @validChars) <= 0
(@string, @invalidChar, N
' '
@loop = @loop + 1;
-- Remove multiple spaces
@goOn tinyint = 1;
@oldLen
WHILE @goOn <> 0
@oldLen = LEN(@string);
IF @oldLen = LEN(@string)
@goOn = 0;
@string;
dbo.fnCalculate4GramIndex
(@fragment
(4))
@validChars
varchar
(50) = (
@
size
= (
LEN(@validChars)
@id
= -2147483648;
-- Calculate position of each char
@pos1
= CHARINDEX(
(@fragment, 1, 1), @validChars);
@pos2
(@fragment, 2, 1), @validChars);
@pos3
(@fragment, 3, 1), @validChars);
@pos4
(@fragment, 4, 1), @validChars);
@id = @id + (@pos1 - 1) +
(@pos2 - 1) * @
+
(@pos3 - 1) * POWER(@
, 2) +
(@pos4 - 1) * POWER(@
, 3);
@id;
dbo.fnBuildIndex4Gram
@ids
TABLE
(id
@string = (
dbo.fnNormalizeDatastring(@string));
@pos
= LEN(@string) - 4;
IF @len <= 0
WHILE @pos <= @len
INSERT
INTO
@ids (id)
dbo.fnCalculate4GramIndex(
(@string, @pos, 4));
@pos = @pos + 1;
-- Small test:
*
FROM
dbo.fnBuildIndex4Gram(N
'Hello World'
IDX;
VIEW
Person.vAddressIndexData
ADR.AddressID,
ADR.AddressLine1 + N
ISNULL
(ADR.AddressLine2 + N
''
) +
ADR.City + N
ADR.PostalCode + N
SPR.StateProvinceCode + N
SPR.
Name
+ N
CR.
(PER.Title + N
(PER.FirstName + N
(PER.MiddleName + N
(PER.LastName + N
(PER.Suffix + N
(EA.EmailAddress + N
((
CAST
(
PHO.PhoneNumber +
Person.PersonPhone
PHO
WHERE
PHO.BusinessEntityID = PER.BusinessEntityID
FOR
XML PATH (
), Type
ColCSV
MAX
))) + N
IndexData
Person.Address
ADR
INNER
JOIN
Person.StateProvince
SPR
ON
ADR.StateProvinceID = SPR.StateProvinceID
Person.CountryRegion
CR
SPR.CountryRegionCode = CR.CountryRegionCode
LEFT
Person.BusinessEntityAddress
BEA
ADR.AddressID = BEA.AddressID
Person.Person
PER
BEA.BusinessEntityID = PER.BusinessEntityID
Person.EmailAddress
EA
PER.BusinessEntityID = EA.BusinessEntityID
AND
PER.EmailPromotion = EA.EmailAddressID;
dbo.Gram4Index(
AddressId
NOT
NULL
,
ID_4Gram
Occurrence
smallint
CONSTRAINT
PK_Gram4Index
PRIMARY
KEY
CLUSTERED
ASC
NONCLUSTERED
INDEX
IDX_Gram4Index_Search
dbo.Gram4Index
(ID_4Gram)
INCLUDE (AddressId, Occurrence);
-- Some additional statistics.
STATISTICS
[STAT_Gram4Index_AddressId]
[dbo].[Gram4Index]
([AddressId]);
[STAT_Gram4Index_Occurrence]
([Occurrence]);
[STAT_Gram4Index_AddressId_Occurrence]
(AddressId, [Occurrence]);
PROCEDURE
dbo.spUpdate4Gram
@addressId
NOCOUNT
-- Delete old index data
DELETE
AddressId = @addressId;
-- Re-create Index for the Address.
(AddressId, ID_4Gram, Occurrence)
ADR.AddressId, NG.ID
ID_NGram4,
COUNT
(*)
CROSS
APPLY
dbo.fnBuildIndex4Gram(ADR.IndexData)
NG
ADR.AddressId = @addressId
GROUP
BY
ADR.AddressId, NG.ID;
addresses
CURSOR
LOCAL
Person.Address;
OPEN
addresses;
FETCH
NEXT
@addressId;
WHILE (@@FETCH_STATUS = 0)
EXEC
dbo.spUpdate4Gram @addressId;
CLOSE
DEALLOCATE
PRINT N
'Finished'
dbo.spSearch4Gram
(@search nvarchar(4000),
@factor
decimal
(9, 4)
-- Normalize search string.
@search = (
dbo.fnNormalizeDatastring(@search));
@len = (LEN(@search) - 4) * @factor;
@len = 1;
-- Weighted 4-Gram index search
WITH
idx
GR4.AddressId,
SUM
(GR4.Occurrence)
Weigth
dbo.fnBuildIndex4Gram(@search)
BIG
GR4
GR4.ID_4Gram = BIG.id
GR4.AddressId
HAVING
(GR4.Occurrence) > @len
ADR.AddressLine1,
ADR.City,
idx.Weigth
(NOLOCK)
ADR.AddressID = idx.AddressId
ORDER
idx.weigth
DESC
-- Test cases
-- Umlaut search => "Buergermeister" as result
dbo.spSearch4Gram N
'Saarland Bürgermeister'
, 0.8;
-- and counter wise
'rotthaeuser germany saarbruecken'
-- City name and telephon number without area code.
'Springfield -555-0181'
, 0.7;
-- Typeo: "Bery" instead "Berry"
'Bery court'
, 0.6;
-- Some part terms
'West gloria California 91791'
, 0.75;
-- Long search text
'5157 Washington 98027 lane marywood Issaquah'
-- Large result set
'Washington'