MySql:计算单词在列中出现的次数

时间:2011-04-13 13:04:27

标签: mysql

例如,如果我在这样的列中有数据

data
I love book
I love apple
I love book
I hate apple
I hate apple

我怎样才能得到这样的结果

I = 5
love = 3
hate = 2
book = 2
apple = 3

我们能用MySQL实现吗?

5 个答案:

答案 0 :(得分:4)

以下是仅使用查询的解决方案:

-- Count how often each word occurs across all rows of table_name.sentence.
--
-- How it works:
--   * The derived table `n` generates the integers 1..100 from two digit
--     tables, so at most the first 100 space-separated tokens of a sentence
--     are examined. Add another digit table to raise that limit.
--   * For each sentence, the WHERE clause keeps only n <= (number of spaces + 1).
--     LENGTH() is safe here because only single-byte spaces are removed.
--   * SUBSTRING_INDEX(SUBSTRING_INDEX(s, ' ', n), ' ', -1) picks the n-th token.
--   * '?', '.' and '!' are stripped before grouping, so "hello!" and "hello"
--     are counted as the same word. Grouping happens once, on the cleaned
--     token, instead of aggregating twice.
--   * Tokens that are empty after cleaning (repeated spaces, "!!!") are not
--     reported as words.
SELECT
    COUNT(*) AS total,
    REPLACE(REPLACE(REPLACE(w.raw_word, '?', ''), '.', ''), '!', '') AS value
FROM (
    SELECT
        SUBSTRING_INDEX(SUBSTRING_INDEX(t.sentence, ' ', n.n), ' ', -1) AS raw_word
    FROM table_name AS t
    CROSS JOIN (
        SELECT ones.N + tens.N * 10 + 1 AS n
        FROM
            (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS ones
        CROSS JOIN
            (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) AS tens
    ) AS n
    WHERE n.n <= 1 + (LENGTH(t.sentence) - LENGTH(REPLACE(t.sentence, ' ', '')))
) AS w
GROUP BY value
HAVING value <> '';

以下是完整可运行的 SQL Fiddle 示例:http://sqlfiddle.com/#!2/17481a/1

首先,我们使用 @peterm 给出的查询(见原文 here 链接)来提取所有单词(如果要调整可处理的单词总数,请按照他的说明操作)。然后把它作为子查询,对每个单词的值执行 COUNT 和 GROUP BY。最后在这个 GROUP BY 的外层再做一次查询,用 REPLACE 去掉单词上可能附带的标点符号后重新分组,即:把"你好!"和"你好"视为同一个单词。

答案 1 :(得分:3)

如果你想进行这种文本分析,我建议使用像lucene这样的东西来获取文档中每个术语的termcount。

答案 2 :(得分:1)

如果您的表比较大,此查询将需要很长时间才能运行。最好在单独的表中维护计数,并在插入值时更新该表;或者,如果不需要实时结果,则每隔一段时间运行一次此查询来更新计数表,然后从计数表中读取数据。这样,您就不必花费几分钟从这个复杂的查询中获取数据。

这是我目前为你准备的内容,可以作为一个好的起点。您唯一还需要做的,就是修改它以遍历每行中的各个单词。您可以使用游标或子查询。

创建测试表:

-- Test fixture: one sentence per row in tbl.str.
-- NOTE(review): the first row, 'data', looks like the column heading from the
-- question rather than a sentence; confirm whether it is meant to be inserted.
CREATE TABLE tbl (
    str VARCHAR(100)
);

INSERT INTO tbl (str)
VALUES
    ('data'),
    ('I love book'),
    ('I love apple'),
    ('I love book'),
    ('I hate apple'),
    ('I hate apple');

从测试表中提取数据:

-- Frequency of each distinct value of tbl.str.
-- This counts whole cell values (entire sentences), not the individual words
-- inside them. GROUP BY already yields one row per str, so no DISTINCT is needed.
-- COUNT(str) counts non-NULL values only.
SELECT
    str AS Word,
    COUNT(str) AS Frequency
FROM tbl
GROUP BY str;

答案 3 :(得分:1)

创建这样的用户定义函数并在查询中使用它

DELIMITER $$

-- getCount(myStr, myword)
--   Returns the number of non-overlapping occurrences of myword inside myStr.
--
--   Parameters:
--     myStr  - text to search in.
--     myword - text to search for. Matching is done by INSTR, so it is a
--              substring match: 'apple' is also found inside 'pineapple'.
--   Returns:
--     INT occurrence count. 0 when myword is empty, or when either argument
--     is NULL (INSTR then yields NULL and the loop body never counts).
--
--   DETERMINISTIC / NO SQL: the result depends only on the arguments and the
--   body touches no tables. Declaring this also lets the function be created
--   when binary logging is enabled.
CREATE FUNCTION `getCount`(myStr VARCHAR(1000), myword VARCHAR(100))
    RETURNS INT
    DETERMINISTIC
    NO SQL
    BEGIN
    DECLARE cnt INT DEFAULT 0;
    DECLARE result INT DEFAULT 1;
    -- INSTR and SUBSTRING work in characters, so the step width must be a
    -- character count too; LENGTH() returns bytes and would skip too far for
    -- multi-byte words.
    DECLARE word_len INT DEFAULT CHAR_LENGTH(myword);

    -- INSTR(s, '') is 1 for any s, so an empty search word would never
    -- shorten myStr and the loop below would not terminate.
    IF word_len = 0 THEN
        RETURN 0;
    END IF;

    WHILE (result > 0) DO
    SET result = INSTR(myStr, myword);
    IF(result > 0) THEN 
        SET cnt = cnt + 1;
        -- Continue searching right after the match just found.
        SET myStr = SUBSTRING(myStr, result + word_len);
    END IF;
    END WHILE;
    RETURN cnt;    

    END$$

DELIMITER ;

希望它有所帮助 Refer This

答案 4 :(得分:0)

拆分字符串的存储过程不是我写的。你可以在这里找到它

http://forge.mysql.com/tools/tool.php?id=4

我写了剩下的代码。

-- Source sentences to be split into words (recreated on every run).
drop table if exists mytable;
create table mytable (
    id     int not null auto_increment,
    mytext varchar(1000),
    primary key (id)
) engine = myisam;

-- Sample rows with mixed punctuation and repeated spaces.
insert into mytable (mytext)
values
    ('I love book,but book sucks!What do you,think   about it? me too'),
    ('I love apple! it rulez.,No, it sucks a lot!!!'),
    ('I love book'),
    ('I hate apple!!! Me too.,!'),
    ('I hate apple');

-- One row per extracted word (recreated on every run).
drop table if exists mywords;
create table mywords (
    id   int not null auto_increment,
    word varchar(50),
    primary key (id)
) engine = myisam;


delimiter //
drop procedure if exists split_string //

-- split_string(input, delimiter)
--   Splits `input` on every occurrence of `delimiter` and stores the pieces,
--   one per row, in the temporary table SplitValues(value). The table is
--   dropped and recreated on each call, so it only holds the latest result.
--
--   Parameters:
--     input     - text to split.
--     delimiter - separator string, up to 10 characters.
--
--   Behaviour:
--     * Pieces that are empty or whitespace-only are skipped. The other
--       pieces are stored as-is, without trimming.
--     * An empty delimiter is treated as "delimiter not found": the whole
--       input becomes a single piece.
--     * A NULL input or NULL delimiter leaves SplitValues empty.
create procedure split_string (
    in input text
    , in `delimiter` varchar(10) 
) 
sql security invoker
begin
    declare cur_position int default 1 ;
    declare remainder text;
    declare cur_string varchar(1000);
    declare delimiter_length tinyint unsigned;

    drop temporary table if exists SplitValues;
    create temporary table SplitValues (
        value varchar(1000) not null 
    ) engine=myisam;

    set remainder = input;
    set delimiter_length = char_length(`delimiter`);

    -- Each pass removes one piece from the front of remainder. The loop
    -- stops when nothing is left or no further delimiter was found.
    while char_length(remainder) > 0 and cur_position > 0 do
        if delimiter_length = 0 then
            -- instr(x, '') is always 1, which would never shorten remainder.
            set cur_position = 0;
        else
            set cur_position = instr(remainder, `delimiter`);
        end if;

        if cur_position = 0 then
            -- No delimiter left: the rest of the text is the last piece.
            set cur_string = remainder;
        else
            set cur_string = left(remainder, cur_position - 1);
        end if;

        if trim(cur_string) != '' then
            insert into SplitValues (value) values (cur_string);
        end if;

        -- Drop the piece just handled together with its delimiter.
        set remainder = substring(remainder, cur_position + delimiter_length);
    end while;

end //
delimiter ;


delimiter //
drop procedure if exists single_words //

-- single_words()
--   Rebuilds mywords from scratch: empties the table, then for every row of
--   mytable replaces '!', ',', '.' and '?' with spaces, splits the text on
--   single spaces via split_string, and appends the resulting pieces to
--   mywords.word.
create procedure single_words()
begin
    declare finish int default 0;
    -- Same width as mytable.mytext so that long rows fit into the variable.
    declare str varchar(1000);
    declare cur_table cursor for
        select replace(replace(replace(replace(mytext,'!',' '),',',' '),'.',' '),'?',' ')
        from mytable;
    -- Set when fetch runs past the last row of the cursor.
    declare continue handler for not found set finish = 1;

    truncate table mywords;

    open cur_table;
    my_loop: loop
        fetch cur_table into str;
        if finish = 1 then
            leave my_loop;
        end if;

        call split_string(str, ' ');
        -- The name must match the case used where split_string creates the
        -- table, because table names are case-sensitive on some platforms.
        insert into mywords (word)
        select value from SplitValues;
    end loop;
    close cur_table;
end //
delimiter ;

-- Rebuild mywords from mytable, then report how often each word occurs.
call single_words();

-- The explicit ORDER BY keeps the report alphabetical on every server
-- version. GROUP BY alone does not guarantee any row order.
select
    word,
    count(*) as word_count
from mywords
group by word
order by word;

+-------+------------+
| word  | word_count |
+-------+------------+
| a     |          1 |
| about |          1 |
| apple |          3 |
| book  |          3 |
| but   |          1 |
| do    |          1 |
| hate  |          2 |
| I     |          5 |
| it    |          3 |
| lot   |          1 |
| love  |          3 |
| me    |          2 |
| No    |          1 |
| rulez |          1 |
| sucks |          2 |
| think |          1 |
| too   |          2 |
| What  |          1 |
| you   |          1 |
+-------+------------+
19 rows in set (0.00 sec)

代码还需要改进才能处理任意标点符号,但大体思路就是这样。