From: Roman Bazalevsky
Date: Sat, 5 Apr 2014 11:12:51 +0000 (+0400)
Subject: Добавлена процедура поиска и удаления дубликатов.
X-Git-Url: https://git.rvb.name/openlib.git/commitdiff_plain/5361042232c8aa9f86c46f8533ef9637dcd2904d

Добавлена процедура поиска и удаления дубликатов.
---

Review notes (this text sits between the separator above and the first
diff header, where patch tools skip it):

* The patch had been collapsed onto a single line.  Line breaks were
  restored; indentation (four spaces per level) and blank context lines
  were reconstructed from the Python syntax and the original hunk line
  counts, so they may differ from the files in the repository.
* Subject in English: "Added a procedure for finding and removing
  duplicates."
* Revised relative to the original commit: docstrings and comments were
  added, ListDups passes its query parameter as the tuple (limit,), and
  RemoveDups builds its set with set(), drops a stray semicolon and no
  longer reuses the builtin name id for its loop variable.  The new-side
  line counts in the hunk headers were updated to match; the index lines
  still carry the original blob ids.

diff --git a/db.py b/db.py
index 84a999d..a39c160 100644
--- a/db.py
+++ b/db.py
@@ -197,6 +197,39 @@ def ListByFormat(format,limit=100):
         print "No connection to DB"
         exit()
 
+def ListDups(limit=100):
+    """Return groups of books that share a title and an author.
+
+    Each row is (b.title, l.author, max(b.id)) for one title/author
+    group of metadata.books joined to metadata.books_authors_link.
+    Only groups containing more than `limit` rows are returned.
+    Prints a message and exits when `database` is falsy.
+    """
+    if database:
+        c = database.cursor()
+        # The parameters must be a sequence: (limit) is just limit, (limit,) is a tuple.
+        c.execute('SELECT b.title,l.author,max(b.id) id FROM metadata.books b,metadata.books_authors_link l where b.id=l.book group by b.title,l.author having count(*)>%s',(limit,))
+        return c.fetchall()
+    else:
+        print "No connection to DB"
+        exit()
+
+def ListByTitleAndAuthor(title,author,id=0):
+    """Return the ids of books with the given title and author.
+
+    One row per match, as produced by fetchall(); the book whose id
+    equals `id` is left out.  Prints a message and exits when
+    `database` is falsy.
+    """
+    if database:
+        c = database.cursor()
+        c.execute('SELECT b.id FROM metadata.books b,metadata.books_authors_link l where b.id=l.book and b.title=%s and l.author=%s and b.id<>%s',(title,author,id))
+        return c.fetchall()
+    else:
+        print "No connection to DB"
+        exit()
+
+
 def Commit():
     if database:
         database.commit()
diff --git a/fb2_process.py b/fb2_process.py
index 38d8876..d8c01c1 100755
--- a/fb2_process.py
+++ b/fb2_process.py
@@ -279,7 +279,30 @@ def CheckFiles(delete = 0):
         db.DelBook(id)
         db.Commit()
 
+def RemoveDups(limit = 100):
+    """Delete duplicate books found by db.ListDups(limit).
+
+    For each (title, author, id) row from db.ListDups, every other book
+    with that title and author (per db.ListByTitleAndAuthor) is collected
+    and then passed to DelBook; the book with the listed id is kept.
+    Does nothing when `limit` is below 2.
+    """
+    if limit<2:
+        return
+    id_to_del=set()
+    recs = db.ListDups(limit)
+    for rec in recs:
+        # A set, so a book matched by several groups is deleted only once.
+        id_to_del.update(db.ListByTitleAndAuthor(rec[0],rec[1],rec[2]))
+    for book in id_to_del:
+        # NOTE(review): book is a row from fetchall(), not a bare id -- confirm DelBook accepts it.
+        # NOTE(review): CheckFiles calls db.DelBook and then db.Commit(); confirm an unqualified
+        # DelBook exists in this module and that these deletions are committed.
+        print "\r Deleting %s..." % (book)
+        DelBook(book)
+
 def main():
+    print "Processing...\r"
     ProcessDir(db.tmp_files)
     CompressAll(2000)
 