Wednesday, June 27, 2012

Boolean Mode with weighted relevance in MySQL

Need weighted relevance while doing Boolean Mode in MySQL? Here's script for that.

--BEGIN SCRIPT AREA

 
--OPTIONAL for testing
CREATE TABLE IF NOT EXISTS `test` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`name` varchar(25) NOT NULL,
`text` text,
PRIMARY KEY (`test_id`),
FULLTEXT KEY `text` (`text`,`name`)
) ;

INSERT INTO `test` (`id`, `name`, `text`) VALUES
(1, 'apples', 'Apples oranges apples bananas grapes grapefruit avocados apples '),
(2, 'oranges', 'grapes tomatoes apples apples apples plums pairs'),
(3, 'bananas', 'strawberries tomatoes apples peppers'),
(4, 'grapes', 'grapes grapefruit banana strawberries');
--END OF OPTIONAL



DELIMITER $$

DROP FUNCTION IF EXISTS `simple_weighted_word_count`$$
CREATE FUNCTION `simple_weighted_word_count` (in_fields TEXT, in_word TEXT)
RETURNS DECIMAL(10, 2)
BEGIN

DECLARE count DECIMAL (10, 2);
DECLARE remaining_phrase TEXT;
DECLARE remaining_str TEXT;
DECLARE current_phrase TEXT;
DECLARE current_word TEXT;
DECLARE beginning_phrase TEXT;
DECLARE end_phrase TEXT;
DECLARE first_char VARCHAR(1);
DECLARE last_char VARCHAR(1);

SET count = 0;
SET remaining_phrase = in_word;
SET remaining_str = '';

WHILE remaining_phrase != '' DO
SET beginning_phrase = SUBSTRING_INDEX(remaining_phrase, '"', 1);
SET last_char = SUBSTRING(beginning_phrase, LENGTH(beginning_phrase), 1);
SET end_phrase = SUBSTRING(remaining_phrase, LENGTH(beginning_phrase) + 2);
SET current_phrase = SUBSTRING_INDEX(end_phrase, '"', 1);
SET remaining_phrase = SUBSTRING(remaining_phrase, LENGTH(CONCAT(beginning_phrase, current_phrase)) + 4);
SET remaining_str = CONCAT(remaining_str, ' ', TRIM(beginning_phrase));
IF last_char = '-' THEN
SET count = count;
ELSEIF last_char = '>' THEN
IF current_phrase != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_phrase, '')))/LENGTH(current_phrase)) * 3);
END IF;
ELSEIF last_char = '<' THEN
IF current_phrase != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_phrase, '')))/LENGTH(current_phrase)) * 1);
END IF;
ELSEIF last_char = '~' THEN
IF current_phrase != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_phrase, '')))/LENGTH(current_phrase)) * .5);
END IF;
ELSE
IF current_phrase != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_phrase, '')))/LENGTH(current_phrase)) * 2);
END IF;
END IF;
END WHILE;

WHILE remaining_str != '' DO
SET current_word = TRIM(SUBSTRING_INDEX(remaining_str, ' ', 1));
SET remaining_str = SUBSTRING(remaining_str, LENGTH(current_word) + 2);
SET first_char = SUBSTRING(current_word, 1, 1);
IF first_char = '-' THEN
SET count = count;
ELSEIF first_char = '>' THEN
SET current_word = SUBSTR(current_word, 2);
IF current_word != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_word, '')))/LENGTH(current_word)) * 1.5);
END IF;
ELSEIF first_char = '<' THEN
SET current_word = SUBSTR(current_word, 2);
IF current_word != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_word, '')))/LENGTH(current_word)) * .5);
END IF;
ELSEIF first_char = '~' THEN
SET current_word = SUBSTR(current_word, 2);
IF current_word != '' THEN
SET count = count + (((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_word, '')))/LENGTH(current_word)) * .25);
END IF;
ELSE
IF current_word != '' THEN
SET count = count + ((LENGTH(in_fields) - LENGTH(REPLACE(LOWER(in_fields), current_word, '')))/LENGTH(current_word));
END IF;
END IF;
END WHILE;

RETURN count;
END$$

DROP FUNCTION IF EXISTS `weighted_word_count`$$
CREATE FUNCTION `weighted_word_count` (in_fields TEXT, in_words TEXT)
RETURNS TEXT
BEGIN
DECLARE count DECIMAL (10, 2);
DECLARE remaining_text TEXT;
DECLARE current_text TEXT;
DECLARE beginning_text TEXT;
DECLARE end_text TEXT;
DECLARE last_char VARCHAR(1);
DECLARE remaining_phrases TEXT;

SET count = 0;
SET remaining_text = in_words;
SET remaining_phrases = '';

WHILE remaining_text != '' DO
SET beginning_text = SUBSTRING_INDEX(remaining_text, '(', 1);
SET last_char = SUBSTRING(beginning_text, LENGTH(beginning_text), 1);
SET end_text = SUBSTRING(remaining_text, LENGTH(beginning_text) + 2);
SET current_text = SUBSTRING_INDEX(end_text, ')', 1);
SET remaining_text = SUBSTRING(remaining_text, LENGTH(CONCAT(beginning_text, current_text)) + 4);
SET remaining_phrases = CONCAT(remaining_text, ' ', TRIM(beginning_text));
IF last_char = '-' THEN
SET count = count;
ELSEIF last_char = '>' THEN
IF current_text != '' THEN
SET count = count + (simple_weighted_word_count(in_fields, current_text) * 1.5);
END IF;
ELSEIF last_char = '<' THEN
IF current_text != '' THEN
SET count = count + (simple_weighted_word_count(in_fields, current_text) * 0.5);
END IF;
ELSEIF last_char = '~' THEN
IF current_text != '' THEN
SET count = count + (simple_weighted_word_count(in_fields, current_text) * 0.25);
END IF;
ELSE
IF current_text != '' THEN
SET count = count + simple_weighted_word_count(in_fields, current_text);
END IF;
END IF;
END WHILE;

IF remaining_phrases != '' THEN
SET count = count + simple_weighted_word_count(in_fields, remaining_phrases);
END IF;

RETURN count;
END$$

DELIMITER ;


--EXAMPLE QUERY.
SELECT *, MATCH(`name`, `text`) AGAINST ('grape* banana*' IN BOOLEAN MODE) as relevance, weighted_word_count(CONCAT(`name`,`text`),'grape banana') as weighted_word_count
FROM `Test`
WHERE MATCH(`name`, `text`) AGAINST ('grape* banana*' IN BOOLEAN MODE) ORDER BY relevance DESC, weighted_word_count DESC;


--END OF SCRIPT AREA

References
1. I totally snatched this from http://www.codingforums.com/archive/index.php/t-176090.html 

No comments:

Post a Comment

There was an error in this gadget