Home » SQL & PL/SQL » SQL & PL/SQL » Jaro-Winkler similarity for 9i
Jaro-Winkler similarity for 9i [message #486569] |
Thu, 16 December 2010 16:09 |
|
Barbara Boehmer
Messages: 9097 Registered: November 2002 Location: California, USA
|
Senior Member |
|
|
I received an email request for a pl/sql function for 9i that would do the same thing as utl_match.jaro_winkler_similarity does in 10g and 11g, so I wrote the following code and test. I thought I would share it here for anyone else who might want it or for anybody to suggest improvements for the code.
CREATE OR REPLACE FUNCTION jws -- Jaro-Winkler similarity
(p_string1 IN VARCHAR2,
p_string2 IN VARCHAR2)
RETURN NUMBER
DETERMINISTIC
AS
v_closeness NUMBER := 0;
v_temp VARCHAR2 (32767);
v_comp1 VARCHAR2 (32767);
v_comp2 VARCHAR2 (32767);
v_matches NUMBER := 0;
v_char VARCHAR2 (1);
v_transpositions NUMBER := 0;
v_d_jaro NUMBER := 0;
v_leading NUMBER := 0;
v_d_winkler NUMBER := 0;
v_jws NUMBER := 0;
BEGIN
-- check for null strings:
IF p_string1 IS NULL OR p_string2 IS NULL THEN
RETURN 0;
END IF;
-- closeness:
v_closeness := (GREATEST (LENGTH (p_string1), LENGTH (p_string2)) / 2) - 1;
-- find matching characters and transpositions within closeness:
v_temp := p_string2;
FOR i IN 1 .. LENGTH (p_string1) LOOP
IF INSTR (v_temp, SUBSTR (p_string1, i, 1)) > 0 THEN
v_char := SUBSTR (p_string1, i, 1);
IF ABS (INSTR (p_string1, v_char) - INSTR (p_string2, v_char)) <= v_closeness THEN
v_comp1 := v_comp1 || SUBSTR (p_string1, i, 1);
v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (p_string1, i, 1)) - 1)
|| SUBSTR (v_temp, INSTR (v_temp, SUBSTR (p_string1, i, 1)) + 1);
END IF;
END IF;
END LOOP;
v_temp := p_string1;
FOR i IN 1 .. LENGTH (p_string2) LOOP
IF INSTR (v_temp, SUBSTR (p_string2, i, 1)) > 0 THEN
v_char := SUBSTR (p_string2, i, 1);
IF ABS (INSTR (p_string2, v_char) - INSTR (p_string1, v_char)) <= v_closeness THEN
v_comp2 := v_comp2 || SUBSTR (p_string2, i, 1);
v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (p_string2, i, 1)) - 1)
|| SUBSTR (v_temp, INSTR (v_temp, SUBSTR (p_string2, i, 1)) + 1);
END IF;
END IF;
END LOOP;
-- check for null strings:
IF v_comp1 IS NULL OR v_comp2 IS NULL THEN
RETURN 0;
END IF;
-- count matches and transpositions within closeness:
FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
v_matches := v_matches + 1;
ELSE
v_char := SUBSTR (v_comp1, i, 1);
IF ABS (INSTR (p_string1, v_char) - INSTR (p_string2, v_char)) <= v_closeness THEN
v_transpositions := v_transpositions + 1;
v_matches := v_matches + 1;
END IF;
END IF;
END LOOP;
v_transpositions := v_transpositions / 2;
-- check for no matches:
IF v_matches = 0
THEN RETURN 0;
END IF;
-- Jaro:
v_d_jaro := ((v_matches / LENGTH (p_string1)) +
(v_matches / LENGTH (p_string2)) +
((v_matches - v_transpositions) / v_matches))
/ 3;
-- count matching leading characters (up to 4):
FOR i IN 1 .. LEAST (LENGTH (p_string1), LENGTH (p_string2), 4) LOOP
IF SUBSTR (p_string1, i, 1) = SUBSTR (p_string2, i, 1) THEN
v_leading := v_leading + 1;
ELSE
EXIT;
END IF;
END LOOP;
-- Winkler:
v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
-- Jaro-Winkler similarity rounded:
v_jws := ROUND (v_d_winkler * 100);
RETURN v_jws;
END jws;
/
SCOTT@orcl_11gR2> WITH
2 strings AS
3 (SELECT NULL string1, NULL string2 FROM DUAL UNION ALL
4 SELECT 'test' string1, NULL string2 FROM DUAL UNION ALL
5 SELECT NULL string1, 'test' string2 FROM DUAL UNION ALL
6 SELECT 'CRATE' string1, 'TRACE' string2 FROM DUAL UNION ALL
7 SELECT 'MARTHA' string1, 'MARHTA' string2 FROM DUAL UNION ALL
8 SELECT 'DWAYNE' string1, 'DUANE' string2 FROM DUAL UNION ALL
9 SELECT 'DIXON' string1, 'DICKSONX' string2 FROM DUAL UNION ALL
10 SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
11 SELECT 'Abroms' string1, 'Abrams' string2 FROM DUAL UNION ALL
12 SELECT 'Lampley' string1, 'Campley' string2 FROM DUAL UNION ALL
13 SELECT 'Jonathon' string1, 'Jonathan' string2 FROM DUAL UNION ALL
14 SELECT 'Jeraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
15 SELECT 'test' string1, 'blank' string2 FROM DUAL UNION ALL
16 SELECT 'everybody' string1, 'every' string2 FROM DUAL UNION ALL
17 SELECT 'a' string1, 'aaa' string2 FROM DUAL)
18 SELECT string1, string2,
19 UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
20 jws (string1, string2) my_jws
21 FROM strings
22 ORDER BY my_jws DESC
23 /
STRING1 STRING2 ORACLE_JWS MY_JWS
---------- ---------- ---------- ----------
MARTHA MARHTA 96 96
Jonathon Jonathan 95 95
Dunningham Cunningham 93 93
Abroms Abrams 92 92
everybody every 91 91
Lampley Campley 90 90
Jeraldine Gerladine 88 88
DWAYNE DUANE 84 84
DIXON DICKSONX 81 81
a aaa 80 80
CRATE TRACE 73 73
test blank 0 0
test 0 0
test 0 0
0 0
15 rows selected.
SCOTT@orcl_11gR2>
|
|
|
Re: Jaro-Winkler similarity for 9i [message #486609 is a reply to message #486569] |
Fri, 17 December 2010 04:58 |
mnitu
Messages: 159 Registered: February 2008 Location: Reims
|
Senior Member |
|
|
Hi Barbara,
Thanks for sharing your function, that's very nice. I have two remarks:
a) It seems that there are some difference between Oracle version and your version concerning accents.
Connected to Oracle Database 10g Enterprise Edition Release 10.2.0.4.0
Connected as mni
SQL>
SQL> WITH strings AS (
2 SELECT 'Géraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
3 SELECT 'Jérôme' string1, 'Jerome' string2 FROM DUAL UNION ALL
4 SELECT 'ça' string1, 'ca' string2 FROM DUAL UNION ALL
5 SELECT 'Üwe' string1, 'Uwe' string2 FROM DUAL
6 )
7 SELECT string1, string2,
8 UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
9 jws (string1, string2) my_jws
10 FROM strings
11 ORDER BY my_jws DESC
12 /
STRING1 STRING2 ORACLE_JWS MY_JWS
--------- --------- ---------- ----------
Géraldine Gerladine 89 82
Üwe Uwe 77 78
Jérôme Jerome 80 70
ça ca 66 67
b) Your function, as actually defined, is not really deterministic
Connected to Oracle Database 10g Enterprise Edition Release 10.2.0.4.0
Connected as mni
SQL> alter session set nls_comp = 'LINGUISTIC';
Session altered
SQL> ALTER SESSION SET NLS_SORT=BINARY_AI;
Session altered
SQL>
SQL> WITH strings AS (
2 SELECT 'Géraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
3 SELECT 'Jérôme' string1, 'Jerome' string2 FROM DUAL UNION ALL
4 SELECT 'ça' string1, 'ca' string2 FROM DUAL UNION ALL
5 SELECT 'Üwe' string1, 'Uwe' string2 FROM DUAL
6 )
7 SELECT string1, string2,
8 UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
9 jws (string1, string2) my_jws
10 FROM strings
11 ORDER BY my_jws DESC
12 /
STRING1 STRING2 ORACLE_JWS MY_JWS
--------- --------- ---------- ----------
Jérôme Jerome 80 100
Üwe Uwe 77 100
ça ca 66 100
Géraldine Gerladine 89 97
|
|
|
|
Re: Jaro-Winkler similarity for 9i [message #486640 is a reply to message #486616] |
Fri, 17 December 2010 10:43 |
|
Michel Cadot
Messages: 68665 Registered: March 2007 Location: Nanterre, France, http://...
|
Senior Member Account Moderator |
|
|
Maybe a solution to this accented characters is to first convert them to their non-accented equivalent.
I have a TRANSLATE expression I used for a close thing:
translate (val,
'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy')
Regards
Michel
[Updated on: Fri, 17 December 2010 10:45] Report message to a moderator
|
|
|
Re: Jaro-Winkler similarity for 9i [message #486647 is a reply to message #486640] |
Fri, 17 December 2010 12:27 |
|
Barbara Boehmer
Messages: 9097 Registered: November 2002 Location: California, USA
|
Senior Member |
|
|
Michel Cadot wrote on Fri, 17 December 2010 08:43
Maybe a solution to this accented characters is to first convert them to their non-accented equivalent.
I have a TRANSLATE expression I used for a close thing:
translate (val,
'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy')
Regards
Michel
That would work if you want to ignore the accents. Translate could either be applied to the strings before they were passed to the function or used in the function, which would make it truly deterministic. If you wanted to match Oracle, I don't know what sort of weight algorithm they applied to the various accents. Realistically, I think it makes more sense to ignore the accents when comparing names, looking for duplicates. Based on your suggestion, I have provided a revised function and test below. I find it odd that Oracle's utl_match.jaro_winkler_similarity considers "ça" and "ca" a complete non-match, instead of an exact match or even partial match. Also, this is not the results that mnitu got, perhaps due to a difference between his 10g and my 11g.
CREATE OR REPLACE FUNCTION jws -- Jaro-Winkler similarity
(p_string1 IN VARCHAR2,
p_string2 IN VARCHAR2)
RETURN NUMBER
DETERMINISTIC
AS
v_string1 VARCHAR2 (32767);
v_string2 VARCHAR2 (32767);
v_closeness NUMBER := 0;
v_temp VARCHAR2 (32767);
v_comp1 VARCHAR2 (32767);
v_comp2 VARCHAR2 (32767);
v_matches NUMBER := 0;
v_char VARCHAR2 (1);
v_transpositions NUMBER := 0;
v_d_jaro NUMBER := 0;
v_leading NUMBER := 0;
v_d_winkler NUMBER := 0;
v_jws NUMBER := 0;
BEGIN
-- check for null strings:
IF p_string1 IS NULL OR p_string2 IS NULL THEN
RETURN 0;
END IF;
-- remove accents:
v_string1 := translate (p_string1,
'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
v_string2 := translate (p_string2,
'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
-- closeness:
v_closeness := (GREATEST (LENGTH (v_string1), LENGTH (v_string2)) / 2) - 1;
-- find matching characters and transpositions within closeness:
v_temp := v_string2;
FOR i IN 1 .. LENGTH (v_string1) LOOP
IF INSTR (v_temp, SUBSTR (v_string1, i, 1)) > 0 THEN
v_char := SUBSTR (v_string1, i, 1);
IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
v_comp1 := v_comp1 || SUBSTR (v_string1, i, 1);
v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string1, i, 1)) - 1)
|| SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string1, i, 1)) + 1);
END IF;
END IF;
END LOOP;
v_temp := v_string1;
FOR i IN 1 .. LENGTH (v_string2) LOOP
IF INSTR (v_temp, SUBSTR (v_string2, i, 1)) > 0 THEN
v_char := SUBSTR (v_string2, i, 1);
IF ABS (INSTR (v_string2, v_char) - INSTR (v_string1, v_char)) <= v_closeness THEN
v_comp2 := v_comp2 || SUBSTR (v_string2, i, 1);
v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string2, i, 1)) - 1)
|| SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string2, i, 1)) + 1);
END IF;
END IF;
END LOOP;
-- check for null strings:
IF v_comp1 IS NULL OR v_comp2 IS NULL THEN
RETURN 0;
END IF;
-- count matches and transpositions within closeness:
FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
v_matches := v_matches + 1;
ELSE
v_char := SUBSTR (v_comp1, i, 1);
IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
v_transpositions := v_transpositions + 1;
v_matches := v_matches + 1;
END IF;
END IF;
END LOOP;
v_transpositions := v_transpositions / 2;
-- check for no matches:
IF v_matches = 0
THEN RETURN 0;
END IF;
-- Jaro:
v_d_jaro := ((v_matches / LENGTH (v_string1)) +
(v_matches / LENGTH (v_string2)) +
((v_matches - v_transpositions) / v_matches))
/ 3;
-- count matching leading characters (up to 4):
FOR i IN 1 .. LEAST (LENGTH (v_string1), LENGTH (v_string2), 4) LOOP
IF SUBSTR (v_string1, i, 1) = SUBSTR (v_string2, i, 1) THEN
v_leading := v_leading + 1;
ELSE
EXIT;
END IF;
END LOOP;
-- Winkler:
v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
-- Jaro-Winkler similarity rounded:
v_jws := ROUND (v_d_winkler * 100);
RETURN v_jws;
END jws;
/
SCOTT@orcl_11gR2> WITH
2 strings AS
3 (SELECT NULL string1, NULL string2 FROM DUAL UNION ALL
4 SELECT 'test' string1, NULL string2 FROM DUAL UNION ALL
5 SELECT NULL string1, 'test' string2 FROM DUAL UNION ALL
6 SELECT 'CRATE' string1, 'TRACE' string2 FROM DUAL UNION ALL
7 SELECT 'MARTHA' string1, 'MARHTA' string2 FROM DUAL UNION ALL
8 SELECT 'DWAYNE' string1, 'DUANE' string2 FROM DUAL UNION ALL
9 SELECT 'DIXON' string1, 'DICKSONX' string2 FROM DUAL UNION ALL
10 SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
11 SELECT 'Abroms' string1, 'Abrams' string2 FROM DUAL UNION ALL
12 SELECT 'Lampley' string1, 'Campley' string2 FROM DUAL UNION ALL
13 SELECT 'Jonathon' string1, 'Jonathan' string2 FROM DUAL UNION ALL
14 SELECT 'Jeraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
15 SELECT 'test' string1, 'blank' string2 FROM DUAL UNION ALL
16 SELECT 'everybody' string1, 'every' string2 FROM DUAL UNION ALL
17 SELECT 'a' string1, 'aaa' string2 FROM DUAL UNION ALL
18 SELECT 'Géraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
19 SELECT 'Jérôme' string1, 'Jerome' string2 FROM DUAL UNION ALL
20 SELECT 'ça' string1, 'ca' string2 FROM DUAL UNION ALL
21 SELECT 'Üwe' string1, 'Uwe' string2 FROM DUAL)
22 SELECT string1, string2,
23 UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
24 jws (string1, string2) my_jws
25 FROM strings
26 ORDER BY my_jws DESC
27 /
STRING1 STRING2 ORACLE_JWS MY_JWS
---------- ---------- ---------- ----------
ça ca 0 100
Jérôme Jerome 74 100
Üwe Uwe 72 100
Géraldine Gerladine 86 97
MARTHA MARHTA 96 96
Jonathon Jonathan 95 95
Dunningham Cunningham 93 93
Abroms Abrams 92 92
everybody every 91 91
Lampley Campley 90 90
Jeraldine Gerladine 88 88
DWAYNE DUANE 84 84
DIXON DICKSONX 81 81
a aaa 80 80
CRATE TRACE 73 73
test blank 0 0
test 0 0
test 0 0
0 0
19 rows selected.
SCOTT@orcl_11gR2>
|
|
|
Re: Jaro-Winkler similarity for 9i [message #689035 is a reply to message #486569] |
Mon, 28 August 2023 13:31 |
|
mathguy
Messages: 108 Registered: January 2023
|
Senior Member |
|
|
Only 13 years later - I hope it's still of interest
A recent discussion on AskTom made me write my own Jaro-Winkler similarity function. Testing against Oracle's version I found that the Oracle implementation is (almost surely) buggy. I discuss it in a reply titled Is there a bug in the Oracle implementation of Jaro-Winkler? in this thread: https://asktom.oracle.com/pls/apex/asktom.search?tag=performace-issue-with-sql-in-the-view-using-utl-matchjaro-winkler-similarity& ;p_session=9590653204898
At least some if not all of the discrepancies you saw between your values and Oracle's results are caused by what I reported there: Oracle has a funny way of calculating "transpositions". I don't know exactly what they do; the errors only seem to exist when the initial count is odd, and then must be divided by 2. In some cases it looks like Oracle truncates the result (which is incorrect - that can be caused either by a misunderstanding of the Jaro definition, or by careless programming in C, dividing by 2 instead of 2.0). But in other cases the value is rounded up - so either there are multiple bugs, or there is only one but it is more complex than simple "integer division", whether accidental or intended.
I found your thread here on OraFAQ doing a search for "Jaro-Winkler accented characters" since I found that my function differs from Oracle's for these too. I can see in a very simple example what Oracle is doing: when computing string length, it counts an accented letter as TWO characters. The Oracle function is written in C; I assume it counts bytes, not characters. Perhaps the Oracle function is only meant to be used for single-byte characters - but if that was the intent, there is no indication of it in the documentation.
I will wait for Chris Saxon to respond on AskTom - then I might write something on the Oracle forum, discussing other aspects of the question from the AskTom thread. I will show my version of the JW function, and also how it could be used in a materialized view (as discussed on AskTom, the Oracle function can't be used in a fast refreshing MV, because somehow it "maintains state" - probably a consequence of using package constants that are not declared as constants.)
|
|
|
Re: Jaro-Winkler similarity for 9i [message #689036 is a reply to message #689035] |
Tue, 29 August 2023 03:26 |
|
Barbara Boehmer
Messages: 9097 Registered: November 2002 Location: California, USA
|
Senior Member |
|
|
There is a lot of interesting stuff in the asktom thread about guessing what is in Oracle's hidden code that causes it to return seemingly wrong results and maintain a state such that a materialized view cannot be fast refreshed on commit.
The original question was about improving the performance of a view that uses jaro_winkler_similarity. I posted a comment and demonstration of using a context index with fuzzy search that uses the domain index to narrow the search results. The jaro_winkler_similarity can them be used on top of that. The scores of both are adjustable.
A long time ago, I wrote my own edit distance function before Oracle had a built-in edit distance. Even after Oracle supplied an edit distance and edit distance similarity, I found myself making modifications to my original, like using the Damerau-Levenshtein distance formula, in which the switching of two adjacent characters is counted as 1 change instead of 2 replacements. I have provided my old code and an example below.
CREATE OR REPLACE FUNCTION dld -- Damerau–Levenshtein distance
(str1 VARCHAR2 DEFAULT NULL,
str2 VARCHAR2 DEFAULT NULL)
RETURN NUMBER DETERMINISTIC
AS
lenStr1 NUMBER := NVL (LENGTH (str1), 0);
lenStr2 NUMBER := NVL (LENGTH (str2), 0);
TYPE mytabtype IS TABLE OF NUMBER INDEX BY BINARY_INTEGER;
TYPE myarray IS TABLE OF mytabtype INDEX BY BINARY_INTEGER;
d myarray;
cost NUMBER := 0;
BEGIN
IF str1 = str2 THEN
RETURN 0;
ELSIF lenStr1 = 0 OR lenStr2 = 0 THEN
RETURN GREATEST (lenStr1, lenStr2);
ELSIF lenStr1 = 1 AND lenStr2 = 1 AND str1 <> str2 THEN
RETURN 1;
ELSE
FOR j in 0 .. lenStr2 LOOP
d (0) (j) := j;
END LOOP;
FOR i in 1 .. lenStr1 LOOP
d (i) (0) := i;
FOR j in 1 .. lenStr2 LOOP
IF SUBSTR (str1, i, 1) = SUBSTR (str2, j, 1) THEN
cost := 0;
ELSE
cost := 1;
END IF;
d (i) (j) := LEAST (
d (i-1) (j) + 1, -- deletion
d (i) (j-1) + 1, -- insertion
d (i-1) (j-1) + cost -- substitution
);
IF i > 1 AND j > 1
AND SUBSTR (str1, i, 1) = SUBSTR (str2, j-1, 1)
AND SUBSTR (str1, i-1, 1) = SUBSTR (str2, j, 1)
THEN
d (i) (j) := LEAST (
d (i) (j),
d (i-2) (J-2) + cost -- transposition
);
END IF;
END LOOP;
END LOOP;
RETURN d (lenStr1) (lenStr2);
END IF;
END dld;
/
C##SCOTT@XE_21.3.0.0.0> SELECT UTL_MATCH.EDIT_DISTANCE ('lion', 'loin'),
2 dld ('lion', 'loin')
3 FROM DUAL
4 /
UTL_MATCH.EDIT_DISTANCE('LION','LOIN') DLD('LION','LOIN')
-------------------------------------- ------------------
2 1
1 row selected.
[Updated on: Tue, 29 August 2023 03:54] Report message to a moderator
|
|
|
Re: Jaro-Winkler similarity for 9i [message #689057 is a reply to message #486569] |
Sat, 02 September 2023 13:07 |
|
mathguy
Messages: 108 Registered: January 2023
|
Senior Member |
|
|
Hi Barbara,
I took a look at your function from 2010, and unfortunately it has a bug. I found it by comparing my results to yours, seeing that they disagree in some cases, and then I read your code and found the problem.
Let me illustrate, and then I will explain the bug in detail.
select str1, str2,
utl_match.jaro_winkler_similarity(str1, str2) as orcl_jws,
jws(str1,str2) as bb_jws
from (select 'ab' as str1, 'bb'as str2 from dual)
;
STR1 STR2 ORCL_JWS BB_JWS
---- ---- ---------- ----------
ab bb 66 0
I am comparing two strings, 'ab' and 'bb'. Both have length 2; the "match distance" is trunc( greatest(2, 2) / 2) - 1 which equals 0. That is, characters match only if they are in the same position in both strings.
The letter 'b' in second position in both strings is a match. There are no transpositions (there can't be any, when there is exactly one match). The first character is different, so there is no Winkler correction to the Jaro calculation.
The Jaro similarity is calculated as follows: both lengths are 2, there is one match, and no transpositions; so...
(1/2 + 1/2 + (1-0)/1) / 3 = 2/3 or 0.66666...
or, "normalized" to a score between 1 and 100, it's 66.66666, truncated to 66. Oracle's calculation is correct in this case.
Your calculation shows 0, because the part of the code that identifies matches doesn't follow the correct algorithm.
The Jaro algorithm is like this: First, since we will need to keep track of characters already found in prior matches, we will need to flag characters as "matched" or "unmatched". Initially all characters are marked "unmatched". Then: for each character in the first string, the first step is to identify the "matching window" - from P - M to P + M, where P is the position of the character in the first string and M is the maximum matching distance (adjusted so that P - M is at least 0, and P + M is at most the length of the second string). THEN - and ONLY THEN - we look for the first unmarked character in the "matching window", that equals the character at position P in the first string.
Your function does something different: It takes a character from the first string, then it looks for the first "unmatched" character in the second string, that equals the character from the first string (if any), and ONLY THEN it checks to see if it is in the "matching window".
The fact that the two methods are not equivalent is illustrated by the example I gave. When comparing 'ab' with 'bb', the 'a' in the first string doesn't match anything in the second string. Then the 'b' in the first string is considered.
In the correct Jaro computation, the maximum "matching distance" is 0. The window to search in the second string is just the same position as in the first string. So we look at the character in second position in the second string. It's 'b', which is a match.
In your approach, when we inspect the second character of 'ab' (the character 'b'), there is nothing marked "matched" in the second string yet. We find the first matching character in the second string, WITH NO REGARD TO THE MATCHING WINDOW. That character is the FIRST 'b' in the second string. Then we compare positions and find that this isn't a match because the position in the second string doesn't fall in the "matching window". At this point your function doesn't look for another match, that may be IN the matching window. So it counts no matches at all between the two strings.
Your function does other things I didn't quite understand (and I think aren't right), but I didn't look in detail. You don't seem to keep track of which characters were already "used up" in earlier matches, in a SINGLE loop, over the first string - as the Jaro definition requires. Instead, you find all the "matches" (according to your definition) for the first string, and then separately all the "matches" for the second string. This has no equivalent in the Jaro definition. In particular, this is not equivalent to finding the matching PAIRS, on which transpositions are calculated.
Your function will give the correct results, I believe, if the strings don't have repeated characters, or if there are repeated characters but they are in the same order compared to other matching pairs. It will fail on examples like 'abca' and 'bcaa' (it does fail in that example, where the Oracle result is correct):
select str1, str2,
utl_match.jaro_winkler_similarity(str1, str2) as orcl_jws,
jws(str1,str2) as bb_jws
from (select 'abca' as str1, 'bcaa'as str2 from dual)
;
STR1 STR2 ORCL_JWS BB_JWS
---- ---- ---------- ----------
abca bcaa 83 67
AS AN ASIDE (unrelated in any way to the bug):
Consider the case of strings of length 1. Like 'a' and 'a'. Any normal-brained person would consider those as a "perfect match", with a similarity score of 1.000 (or "100"). However, a strict interpretation of the Jaro definition should find no match! That is because GREATEST(LENGTH_1, LENGTH_2) is 1; then TRUNC(1/2) - 1 equals -1, so the "matching window" is always empty! Indeed, the "distance" between two positions is never "less than or equal to -1" (it is always non-negative).
Oracle, when they implemented JW in the UTL_MATCH function, decided to ignore the Jaro definition in this respect; they calculate 100 as the JW similarity. Your function calculates 0 - but I don't consider that as a bug in your code. Rather, it is a bug in Jaro's own definition. The "maximum matching distance" should be defined as GREATEST(0, JARO_DEFINITION) where the latter is Jaro's definition, in order to work as expected for this special case.
Of course, in real life people would rarely consider the "similarity" of one-character strings - although if we are matching addresses component by component and something like "apartment" can be '3' or '5', then we do consider one-character strings. By contrast, the bug I described earlier seems more general.
|
|
|
Re: Jaro-Winkler similarity for 9i [message #689058 is a reply to message #689057] |
Sat, 02 September 2023 14:02 |
|
Barbara Boehmer
Messages: 9097 Registered: November 2002 Location: California, USA
|
Senior Member |
|
|
Hi mathguy,
I stumbled upon the following that I think you will find interesting and may shed some light on the discrepancies between Oracle's function and the standard definition that seem like they might be bugs.
https://docs.oracle.com/cd/E19182-01/821-0919/gfjtr/index.html
There are multiple links to other pages and links within those pages.
It was a lot of years ago that I created that function and I remember it took me a long time to think through it.
At this point, I believe:
The formula that Oracle uses is not visible and the name may be misleading as it actually incorporates other methods.
Oracle's function may or may not have bugs.
My function may or may not have bugs and at this point I would not know what to compare it to, in order to fix it. I don't feel like messing with it, but if you want to fix mine or create your own, please go ahead and share the result.
I miss the old days when asktom meant ask Tom Kyte and not ask the Oracle masters. Tom used to get to the bottom of things and either explain in detail and demonstrate or file a bug report.
Regards,
Barbara
[Updated on: Sat, 02 September 2023 14:05] Report message to a moderator
|
|
|
Re: Jaro-Winkler similarity for 9i [message #689059 is a reply to message #689057] |
Sat, 02 September 2023 14:22 |
|
mathguy
Messages: 108 Registered: January 2023
|
Senior Member |
|
|
For what it's worth, here is my implementation of the Jaro-Winkler similarity function.
UTL_MATCH has a "Jaro-Winkler" function and a "Jaro-Winkler similarity" function. The latter takes the return value from the former, it multiplies by 100 and truncates the result. This is NOT the theoretical definition of the "similarity" function, it's Oracle's terminology. The theoretical similarity function is the FIRST Oracle function, the one that returns a decimal value between 0 and 1 (not an integer between 1 and 100). My function computes "Jaro-Winkler similarity" as a decimal value between 0 and 1, and should be compared to UTL_MATCH.JARO_WINKLER (not JARO_WINKLER_SIMILARITY).
In my implementation, I define "maximum matching distance" as GREATEST(JMD, 0) where JMD is the "maximum matching distance" defined by Jaro himself. This is to correct for a mistake (I believe) in Jaro's definition; namely, when both strings have length 1, Jaro's definition of "maximum matching distance" is -1, which makes no sense. The similarity of a one-character string with itself should be 1, not 0. Oracle did the same in their implementation.
Note about NULL: Oracle views NULL and the empty string as the same, so we must make a choice. In a sane system (which Oracle DB is not), the similarity function should return NULL if either string is NULL, it should return 1 if both strings are the empty string (seen as different from NULL), and it should return 0 if one string is the empty string and the other is a non-NULL, non-empty string. Here I followed the UTL_MATCH lead: if either string is NULL (or "empty"), the function returns 0. It doesn't return NULL in that case; and it doesn't return 1 even if both strings are NULL ("empty").
The code is written for - and tested in - Oracle 12; for example I use PRAGMA UDF (to optimize the function for being called from SQL), which didn't exist before Oracle 12. However, it can be easily modified to work for earlier versions.
For the Winkler adjustment, I hard-coded the maximum prefix length as 4 and the prefix scaling factor as 0.1 (as in the original Winkler paper, and as Oracle has done). These can be added as function parameters - and the code modified as needed - if it matters. If that is done, error handling must be added to check that the product of the two is less than or equal to 1.
The function works for ASCII text - and, moreover, I only tested on my system, where the character set and encoding uses single bytes for ASCII characters. I don't know how (or even if) this would work with UTF-16 encoding, for example. This has to do, in particular, with accented characters and such.
Here goes:
create or replace function jw (str1 varchar2, str2 varchar2)
return number
authid definer
deterministic
as
pragma udf;
-- js will hold the Jaro similarity; d will hold the return value
js number;
d number := 0;
-- variables to hold matches, transpositions, and init agreement
-- my "transpositions" number is the actual count, not divided by 2;
-- I divide by 2 in the final formula
matches number;
transpositions number;
init_agreement number;
-- We will use two arrays of boolean to track matching
type arr_of_boolean is varray(4000) of boolean;
b1 arr_of_boolean;
b2 arr_of_boolean;
-- len1 and len2 will hold the length of the two input strings
-- lenm is the greater of the two lengths
-- lenw is used for the Winkler correction
-- max_distance is the maximum distance for matching in the Jaro definition
len1 number;
len2 number;
lenm number;
lenw number;
max_distance number;
-- r_start and r_end mark the allowed range of matches in str2
r_start number;
r_end number;
-- loop variables
k number;
begin
-- if str1 or str2 is null, the function will return 0
if str1 is not null and str2 is not null then
-- collect the various length and related values
len1 := length(str1);
len2 := length(str2);
lenm := greatest(len1, len2);
lenw := least(len1, len2, 4);
-- max distance for matches (watch out for lenm = 1!)
-- THIS IS DIFFERENT FROM JARO DEFINITION by adding GREATEST(..., 0)
max_distance := greatest(trunc(lenm/2) - 1, 0);
-- define the arrays of booleans
-- elements are initialized with null, which is OK for us
-- a value changes to TRUE when the corresponding character appears in a match
b1 := arr_of_boolean();
b2 := arr_of_boolean();
b1.extend(len1);
b2.extend(len2);
-- find the matches
matches := 0;
for i in 1 .. len1 loop
r_start := greatest(1, i - max_distance);
r_end := least(len2, i + max_distance);
for k in r_start .. r_end loop
if b2(k) then
continue;
end if;
if substr(str1, i, 1) != substr(str2, k, 1) then
continue;
end if;
b1(i) := true;
b2(k) := true;
matches := matches + 1;
exit;
end loop;
end loop;
-- Continue only if matches > 0; else d = 0 will be returned
if matches > 0 then
-- Compute transpositions
transpositions := 0;
k := 1;
for i in 1 .. len1 loop
if b1(i) is null then
continue;
end if;
while b2(k) is null loop
k := k + 1;
end loop;
if substr(str1, i, 1) != substr(str2, k, 1) then
transpositions := transpositions + 1;
end if;
k := k + 1;
end loop;
-- Compute Jaro similarity
js := (matches * (1/len1 + 1/len2) + 1 - transpositions/(2 * matches)) / 3;
-- Compute initial agreement for Winkler correction
init_agreement := 0;
k := 1;
while k <= lenw and substr(str1, k, 1) = substr(str2, k, 1) loop
init_agreement := init_agreement + 1;
k := k + 1;
end loop;
-- Compute Jaro-Winkler similarity
-- Assumes l = 4 and p = 0.1 as in Winkler's original paper
d := js + init_agreement * 0.1 * (1 - js);
end if;
end if;
return d;
end;
|
|
|
Re: Jaro-Winkler similarity for 9i [message #689060 is a reply to message #689059] |
Sat, 02 September 2023 15:09 |
|
Barbara Boehmer
Messages: 9097 Registered: November 2002 Location: California, USA
|
Senior Member |
|
|
Here is my test, removing the diacritical marks and multiplying your result by 100 and rounding it for comparison. It seems to perfectly match Oracle, so if that was the goal then you succeeded, at least with this original dataset.
C##SCOTT@XE_21.3.0.0.0> create or replace function jw (str1 varchar2, str2 varchar2)
2 -- author of function: mathguy
3 -- from OraFAQ thread: http://www.orafaq.com/forum/mv/msg/164224/689059/#msg_689059
4 return number
5 authid definer
6 deterministic
7 as
8 pragma udf;
9
10 -- js will hold the Jaro similarity; d will hold the return value
11 js number;
12 d number := 0;
13
14 -- variables to hold matches, transpositions, and init agreement
15 -- my "transpositions" number is the actual count, not divided by 2;
16 -- I divide by 2 in the final formula
17 matches number;
18 transpositions number;
19 init_agreement number;
20
21 -- We will use two arrays of boolean to track matching
22 type arr_of_boolean is varray(4000) of boolean;
23 b1 arr_of_boolean;
24 b2 arr_of_boolean;
25
26 -- len1 and len2 will hold the length of the two input strings
27 -- lenm is the greater of the two lengths
28 -- lenw is used for the Winkler correction
29 -- max_distance is the maximum distance for matching in the Jaro definition
30 len1 number;
31 len2 number;
32 lenm number;
33 lenw number;
34 max_distance number;
35
36 -- r_start and r_end mark the allowed range of matches in str2
37 r_start number;
38 r_end number;
39
40 -- loop variables
41 k number;
42 begin
43 -- if str1 or str2 is null, the function will return 0
44 if str1 is not null and str2 is not null then
45
46 -- collect the various length and related values
47 len1 := length(str1);
48 len2 := length(str2);
49 lenm := greatest(len1, len2);
50 lenw := least(len1, len2, 4);
51
52 -- max distance for matches (watch out for lenm = 1!)
53 -- THIS IS DIFFERENT FROM JARO DEFINITION by adding GREATEST(..., 0)
54 max_distance := greatest(trunc(lenm/2) - 1, 0);
55
56 -- define the arrays of booleans
57 -- elements are initialized with null, which is OK for us
58 -- a value changes to TRUE when the corresponding character appears in a match
59 b1 := arr_of_boolean();
60 b2 := arr_of_boolean();
61 b1.extend(len1);
62 b2.extend(len2);
63
64 -- find the matches
65 matches := 0;
66 for i in 1 .. len1 loop
67 r_start := greatest(1, i - max_distance);
68 r_end := least(len2, i + max_distance);
69 for k in r_start .. r_end loop
70 if b2(k) then
71 continue;
72 end if;
73 if substr(str1, i, 1) != substr(str2, k, 1) then
74 continue;
75 end if;
76 b1(i) := true;
77 b2(k) := true;
78 matches := matches + 1;
79 exit;
80 end loop;
81 end loop;
82
83 -- Continue only if matches > 0; else d = 0 will be returned
84
85 if matches > 0 then
86 -- Compute transpositions
87 transpositions := 0;
88 k := 1;
89 for i in 1 .. len1 loop
90 if b1(i) is null then
91 continue;
92 end if;
93 while b2(k) is null loop
94 k := k + 1;
95 end loop;
96 if substr(str1, i, 1) != substr(str2, k, 1) then
97 transpositions := transpositions + 1;
98 end if;
99 k := k + 1;
100 end loop;
101
102 -- Compute Jaro similarity
103 js := (matches * (1/len1 + 1/len2) + 1 - transpositions/(2 * matches)) / 3;
104
105 -- Compute initial agreement for Winkler correction
106 init_agreement := 0;
107 k := 1;
108 while k <= lenw and substr(str1, k, 1) = substr(str2, k, 1) loop
109 init_agreement := init_agreement + 1;
110 k := k + 1;
111 end loop;
112
113 -- Compute Jaro-Winkler similarity
114 -- Assumes l = 4 and p = 0.1 as in Winkler's original paper
115 d := js + init_agreement * 0.1 * (1 - js);
116 end if;
117 end if;
118
119 return d;
120 end;
121 /
Function created.
C##SCOTT@XE_21.3.0.0.0> SHOW ERRORS
No errors.
C##SCOTT@XE_21.3.0.0.0> WITH
2 strings AS
3 (SELECT NULL string1, NULL string2 FROM DUAL UNION ALL
4 SELECT 'test' string1, NULL string2 FROM DUAL UNION ALL
5 SELECT NULL string1, 'test' string2 FROM DUAL UNION ALL
6 SELECT 'CRATE' string1, 'TRACE' string2 FROM DUAL UNION ALL
7 SELECT 'MARTHA' string1, 'MARHTA' string2 FROM DUAL UNION ALL
8 SELECT 'DWAYNE' string1, 'DUANE' string2 FROM DUAL UNION ALL
9 SELECT 'DIXON' string1, 'DICKSONX' string2 FROM DUAL UNION ALL
10 SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
11 SELECT 'Abroms' string1, 'Abrams' string2 FROM DUAL UNION ALL
12 SELECT 'Lampley' string1, 'Campley' string2 FROM DUAL UNION ALL
13 SELECT 'Jonathon' string1, 'Jonathan' string2 FROM DUAL UNION ALL
14 SELECT 'Jeraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
15 SELECT 'test' string1, 'blank' string2 FROM DUAL UNION ALL
16 SELECT 'everybody' string1, 'every' string2 FROM DUAL UNION ALL
17 SELECT 'a' string1, 'aaa' string2 FROM DUAL UNION ALL
18 SELECT 'Geraldine' string1, 'Gerladine' string2 FROM DUAL UNION ALL
19 SELECT 'Jerome' string1, 'Jerome' string2 FROM DUAL UNION ALL
20 SELECT 'ca' string1, 'ca' string2 FROM DUAL UNION ALL
21 SELECT 'Uwe' string1, 'Uwe' string2 FROM DUAL)
22 SELECT string1, string2,
23 UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
24 round (jw (string1, string2) * 100) mathguy_jws -- multiplied by 100 and rounded for comparison
25 FROM strings
26 ORDER BY mathguy_jws DESC
27 /
STRING1 STRING2 ORACLE_JWS MATHGUY_JWS
---------- ---------- ---------- -----------
ca ca 100 100
Jerome Jerome 100 100
Uwe Uwe 100 100
Geraldine Gerladine 97 97
MARTHA MARHTA 96 96
Jonathon Jonathan 95 95
Dunningham Cunningham 93 93
Abroms Abrams 92 92
everybody every 91 91
Lampley Campley 90 90
Jeraldine Gerladine 88 88
DWAYNE DUANE 84 84
DIXON DICKSONX 81 81
a aaa 80 80
CRATE TRACE 73 73
test blank 0 0
test 0 0
test 0 0
0 0
19 rows selected.
|
|
|
Goto Forum:
Current Time: Sat Jun 29 08:40:47 CDT 2024
|