我有大量的文字,我正在尝试计算其中的所有单词以获取其频率。将字典保存在内存中太大了所以我使用sqlite来完成这项任务。我编写了一个Database类,并将该词用作主键,因为它应该是唯一的。
在这个过程的某个地方,它失败了,因为“频率”这个词试图加两次。我不知道为什么会发生这种情况,因为还有许多其他常用词已经被计算过两次......这是我的两个文件....
dbmaker.py
import sqlite3
import sys
class Database():
'''Creates an object with method for adding and checking words against an sqlite database'''
def __init__(self):
self.dbname = raw_input('What will be the name of this database: ')
self.table1 = 'words'
self.column1 = 'word_name'
self.column1t = 'TEXT'
self.column2 = 'frequency'
self.column2t = 'INTEGER'
self.conn = sqlite3.connect(self.dbname)
self.c = self.conn.cursor()
self.c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'.\
format(tn=self.table1, nf=self.column1, ft=self.column1t))
self.c.execute('ALTER TABLE {tn} ADD COLUMN {cn} {ct}'.\
format(tn=self.table1, cn=self.column2, ct=self.column2t))
self.conn.commit()
self.checkfile = open('check.txt', 'w+')
def check_word(self, word_name):
exist = self.c.execute('SELECT * FROM {tn} WHERE {cn}="{wn}"'.\
format(tn=self.table1, cn=self.column1, wn=word_name))
exist = self.c.fetchall()
if exist:
new_freq = exist[0][1] + 1
self.c.execute("UPDATE {tn} SET {c2n}={en} WHERE {c1n}='{word}'".\
format(tn=self.table1, c2n=self.column2, en=new_freq, c1n=self.column1, word=word_name))
return True
else:
return False
def add_word(self, word_name, frequency=1):
self.checkfile.write('%s\n' % word_name)
self.c.execute('INSERT INTO {tn} ({c1n}, {c2n}) VALUES ("{wn}", {f})'.\
format(tn=self.table1, c1n=self.column1, c2n=self.column2, wn=word_name, f=frequency))
crunch.py
import sys
import operator
from dbmaker import Database
db = Database()
infile = open(sys.argv[1])
for line in infile:
line = line.split()
for word in line:
word=word.lower()
if not db.check_word(word):
db.add_word(word)
db.conn.commit() db.conn.close()
这是我的错误:
Traceback (most recent call last):
File "crunch.py", line 13, in <module>
db.add_word(word)
File "/home/ubuntu/wikicorpus/dbmaker.py", line 44, in add_word
format(tn=self.table1, c1n=self.column1, c2n=self.column2, wn=word_name, f=frequency))
sqlite3.IntegrityError:UNIQUE约束失败:words.word_name