Добавлена процедура поиска и удаления дубликатов.
[openlib.git] / fb2_process.py
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 import re
5 import db
6 import fb2_meta
7 import os
8 import sys
9 import shutil
10
# Cache of tag-mapping rules. Stays None until GetTagsMapping() replaces it
# with a list of (compiled regex, replacement tag) pairs.
mapping = None
12
def GetTagsMapping(db):
  """Load the tag-mapping rules into the module-level ``mapping`` cache.

  Selects every (tag_mask, tag_result) row from metadata.tags_mapping using a
  cursor obtained from the given connection, compiles each mask as a regular
  expression and stores the resulting list of
  (compiled pattern, UTF-8 encoded result) pairs in ``mapping``.

  Note: the ``db`` parameter is a connection object; inside this function it
  shadows the imported ``db`` module.
  """
  global mapping
  cursor = db.cursor()
  cursor.execute("select tag_mask,tag_result from metadata.tags_mapping")
  # The raw rows are parked in the cache first and then replaced by the
  # compiled rules.
  mapping = cursor.fetchall()
  mapping = [(re.compile(mask),result.encode('utf-8')) for mask,result in mapping]
22   
def Matches(tag):
  """Translate one tag through the cached mapping rules.

  Returns the replacement of the first rule in ``mapping`` whose pattern
  matches at the start of ``tag``; when no rule matches, the tag is returned
  unchanged. ``mapping`` must already hold (pattern, replacement) pairs.
  """
  hits = (result for pattern,result in mapping if pattern.match(tag))
  # Only the first hit is consumed; the tag itself is the fallback.
  return next(hits,tag)
29
def NewTags(tags):
  """Return the de-duplicated list of mapped tags for ``tags``.

  Loads the mapping rules from db.database first whenever the cache is still
  empty (or None). Every tag is passed through Matches(); duplicates are
  dropped by collecting the results in a set, so the order of the returned
  list is unspecified.
  """
  if not mapping:
    GetTagsMapping(db.database)
  unique_tags = set(Matches(tag) for tag in tags)
  return list(unique_tags)
38
class MetaData:
  """Metadata of one FB2 book, registered in the database on construction.

  Creating an instance is not a pure operation: unless the book is
  blacklisted, the constructor creates the book record and links it to its
  authors, languages, publisher, series and tags through the ``db`` module.
  Nothing is committed here; the caller decides between commit and rollback.

  ``state`` is "trash" when the mapped tags contain 'trash' (in that case
  nothing is stored and only ``size`` and ``state`` are set) and "done" once
  everything has been registered.
  """

  def GetPath(self):
    """Compute the storage directory and data file name of the book.

    Reads ``title``, ``author`` and ``book_id``; sets ``sort_title``,
    ``sort_author``, ``path`` and ``dataname``.
    """
    self.sort_title = db.SortName(self.title).replace(' ','_')
    self.sort_author = db.SortAuthorName(self.author).replace(' ','_')
    # The directory tree fans out by growing prefixes of the sort author name
    # and ends in "<sort title> (<book id>)".
    self.path = ("%s/%s/%s/%s/%s (%d)" % (self.sort_author[0],self.sort_author[0:2],self.sort_author[0:4],self.sort_author[0:32],self.sort_title[0:64],self.book_id))
    # '/' is removed so the file name cannot introduce extra path components.
    self.dataname = (self.title.decode('utf-8')[0:64]+' '+self.author.decode('utf-8')[0:32]).replace('/','')

  def __init__(self,meta_dict,size):
    """Build the metadata from ``meta_dict`` and store it in the database.

    ``meta_dict`` is read for the keys 'tags', 'authors', 'languages',
    'publisher', 'book_title', 'pubdate', 'isbn', 'series', 'series_index',
    'cover' and 'comments'. Every field is optional: a missing or unusable
    value falls back to a default instead of failing the import. ``size`` is
    the size of the book file and is stored together with the path.

    Errors raised by db.CreateBook, the db.LinkBookTo* calls, GetPath() or
    db.SetPath are not handled here and propagate to the caller.
    """
    self.size = size

    try:
      tags = NewTags(meta_dict['tags'])
    except Exception:
      tags = ['other']
    if 'trash' in tags:
      # Blacklisted genre: leave the database untouched.
      self.state = "trash"
      return
    self.tags = tags
    try:
      tag = tags[0]
    except Exception:
      tag = 'Жанр неизвестен'

    try:
      self.author = meta_dict['authors'][0].encode('utf-8')
      self.authors = meta_dict['authors']
      author_ids = set()
      try:
        for author in self.authors:
          author_ids.add(db.GetOrCreateAuthor(author.encode('utf-8')))
      except Exception:
        # Best effort: keep the ids collected so far.
        pass
    except Exception:
      # No usable author: file the book under a per-genre placeholder.
      self.author = 'Неизвестный Автор (%s)' % (tag)
      self.authors = []
      author_ids = []

    try:
      self.langs = meta_dict['languages']
    except Exception:
      self.langs = None
    if not self.langs:
      self.langs = ['ru']
    lang_ids = set()
    try:
      # Iterate self.langs (not meta_dict) so that the 'ru' default is
      # actually linked to the book when the file declares no language.
      for lang in self.langs:
        lang_ids.add(db.GetOrCreateLang(lang.encode('utf-8')))
    except Exception:
      pass

    self.publisher = None
    pub_id = None
    try:
      self.publisher = meta_dict['publisher'].encode('utf-8')
      pub_id = db.GetOrCreatePublisher(self.publisher)
    except Exception:
      pass

    try:
      title = meta_dict['book_title'].encode('utf-8')
    except Exception:
      title = 'Название неизвестно'
    self.title = title

    try:
      pub_date = meta_dict['pubdate']
    except Exception:
      pub_date = None
    self.pub_date = pub_date

    try:
      isbn = meta_dict['isbn'].encode('utf-8')
    except Exception:
      isbn = None
    self.isbn = isbn

    self.series = None
    ser_id = None
    try:
      self.series = meta_dict['series'].encode('utf-8')
      # NOTE(review): unlike the other GetOrCreate* calls this one receives
      # the un-encoded value; kept as is, confirm against db.GetOrCreateSeries.
      ser_id = db.GetOrCreateSeries(meta_dict['series'])
    except Exception:
      pass
    try:
      ser_num = meta_dict['series_index']
    except Exception:
      ser_num = None
    self.series_idx = ser_num

    tag_ids = set()
    try:
      for tag in tags:
        tag_ids.add(db.GetOrCreateTag(tag))
    except Exception:
      pass

    try:
      self.cover = meta_dict['cover']
      self.has_cover = 1
    except Exception:
      self.cover = None
      self.has_cover = 0

    try:
      self.comments = meta_dict['comments'].encode('utf-8')
    except Exception:
      self.comments = ''

    book_id = db.CreateBook(title,pub_date,ser_num,isbn)
    self.book_id = book_id

    db.LinkBookToAuthors(book_id,author_ids)
    db.LinkBookToLangs(book_id,lang_ids)
    if pub_id:
      db.LinkBookToPublishers(book_id,pub_id)
    if ser_id:
      db.LinkBookToSeries(book_id,ser_id)
    db.LinkBookToTags(book_id,tag_ids)
    if self.comments:
      db.StoreComment(book_id,self.comments)

    self.GetPath()
    db.SetPath(self.book_id,self.path,self.dataname,self.size,self.has_cover)
    self.state = "done"
158
def _MoveToFailed(location):
  """Park an unimportable file in db.failed_files and roll back the database.

  The rollback runs even when moving the file fails.
  """
  try:
    # The file may be gone already; skipping the move keeps the rollback
    # reachable.
    if os.path.exists(location):
      shutil.move(location,db.failed_files+os.path.basename(location))
      print("Moved to failed_files")
  finally:
    db.Rollback()

def ProcessFile(filename):
  """Import a single FB2 file into the library.

  The metadata is parsed and registered through MetaData. Depending on the
  resulting state the file is
    * moved to its library directory under db.file_root (plus cover.jpg when
      a cover is present) and the transaction is committed,
    * deleted when its tags are blacklisted, or
    * moved to db.failed_files with a rollback on any other state or on any
      error, including a failure to read or parse the file.
  """
  # Tracks where the file currently is, so a failure after the move into the
  # library still finds it.
  location = filename
  try:
    size = os.path.getsize(filename)
    with open(filename) as stream:
      meta = fb2_meta.get_metadata(stream)

    book = MetaData(meta.__dict__,size)

    if book.state=="done":

      new_dir_path = db.file_root + book.path
      new_file_path = new_dir_path + '/' + book.dataname + '.fb2'

      os.makedirs(new_dir_path,0o755)
      shutil.move(filename,new_file_path)
      location = new_file_path

      if book.has_cover:
        cover_path = new_dir_path + '/cover.jpg'
        print("Book has cover, try to store to "+cover_path)
        # The decoded cover is binary image data, hence 'wb'.
        with open(cover_path,'wb') as coverfile:
          coverfile.write(book.cover.decode('base64'))

      db.Commit()
      print("Moved to "+new_dir_path)

    elif book.state=="trash":

      print("Tags blacklisted, trashing")
      os.remove(filename)

    else:

      _MoveToFailed(location)

  except Exception:

    _MoveToFailed(location)
204
def ProcessDir(dirname):
  """Run ProcessFile() on every entry of ``dirname`` ending in ".fb2".

  Only the directory itself is listed; subdirectories are not descended into.
  """
  fb2_names = [name for name in os.listdir(dirname) if name.endswith(".fb2")]
  for name in fb2_names:
    print("Processing "+name)
    ProcessFile(os.path.join(dirname,name))
210
def DelBook(id):
  """Delete a book: its files, its directory and its database record.

  Does nothing when the database has no path for ``id``. Otherwise every
  entry of the book directory is removed, the record is deleted, the now
  empty directory is removed and the transaction is committed.
  """
  rel_path = db.PathByID(id)
  # Test the stored path itself: once joined with db.file_root the result is
  # always truthy, and an empty stored path would point at the library root.
  if not rel_path:
    return
  path = os.path.join(db.file_root,rel_path)
  for name in os.listdir(path):
    os.remove(os.path.join(path,name))
  db.DelBook(id)
  os.rmdir(path)
  db.Commit()
219
def CompressBook(id):
  """Zip the FB2 data file of a book and mark it as 'FB2.ZIP'.

  Runs ``zip --move --junk-paths <file>.zip <file>``; the format change is
  recorded and committed only when zip exits with status 0. Does nothing
  when the database has no path for ``id``.
  """
  # Imported locally so this function does not depend on the file's import
  # block.
  import subprocess

  path = db.PathByID(id)
  if not path:
    return
  datafile = os.path.join(db.file_root,path,db.DataByID(id,'FB2'))
  zipfile = datafile + '.zip'
  # File names are derived from book titles, so they are passed as an
  # argument list instead of being spliced into a shell command line.
  args = ['zip','--move','--junk-paths',zipfile,datafile]
  # Python 2: unicode arguments are encoded explicitly as UTF-8.
  args = [arg if isinstance(arg,str) else arg.encode('utf-8') for arg in args]
  print(' '.join(args))
  try:
    status = subprocess.call(args)
  except OSError:
    # zip could not be started; treat it like a non-zero exit status.
    status = -1
  if status==0:
    db.ChangeBookFormat(id,'FB2','FB2.ZIP')
    db.Commit()
234
def UnCompressBook(id):
  """Unzip the FB2.ZIP data file of a book and mark it as 'FB2'.

  Runs ``unzip <archive> -d <book directory>``; the archive is removed and
  the format change is recorded and committed only when unzip exits with
  status 0. Does nothing when the database has no path for ``id``.
  """
  # Imported locally so this function does not depend on the file's import
  # block.
  import subprocess

  path = db.PathByID(id)
  if not path:
    return
  target_dir = os.path.join(db.file_root,path)
  datafile = os.path.join(target_dir,db.DataByID(id,'FB2.ZIP'))
  # File names are derived from book titles, so they are passed as an
  # argument list instead of being spliced into a shell command line.
  args = ['unzip',datafile,'-d',target_dir]
  # Python 2: unicode arguments are encoded explicitly as UTF-8.
  args = [arg if isinstance(arg,str) else arg.encode('utf-8') for arg in args]
  try:
    status = subprocess.call(args)
  except OSError:
    # unzip could not be started; treat it like a non-zero exit status.
    status = -1
  if status==0:
    os.remove(datafile)
    db.ChangeBookFormat(id,'FB2.ZIP','FB2')
    db.Commit()
249
def CompressAll(limit=100):
  """Compress books stored as plain FB2, at most ``limit`` per call.

  The candidates come from db.ListByFormat('FB2', limit). A book that fails
  to compress is reported on stderr and skipped so the rest of the batch
  still runs. KeyboardInterrupt and SystemExit are no longer swallowed, so
  a long batch can be interrupted.
  """
  for row in db.ListByFormat('FB2',limit):
    try:
      # The first column of each row is used as the book id.
      CompressBook(row[0])
    except Exception as error:
      sys.stderr.write("Failed to compress book %r: %s\n" % (row,error))
257
def CheckFiles(delete = 0):
  """Report books whose data file is missing on disk.

  Checks up to 300000 books of format 'FB2' and then up to 300000 of format
  'FB2.ZIP'. A running counter is written to stdout, and every missing file
  is reported. With ``delete == 1`` the database record of each book with a
  missing file is deleted and the deletion committed immediately.
  """
  for fmt in ('FB2','FB2.ZIP'):
    rows = db.ListByFormat(fmt,300000)
    for cnt,row in enumerate(rows,1):
      sys.stdout.write("\r%s"%(cnt))
      # The first column of each row is the book id; the same value is used
      # for the lookups and for the deletion below.
      book_id = row[0]
      datafile = os.path.join(db.file_root,db.PathByID(book_id),db.DataByID(book_id,fmt))
      if not os.path.exists(datafile):
        print("\r File %s not found" % (datafile))
        if delete==1:
          db.DelBook(book_id)
          db.Commit()
281
def RemoveDups(limit = 100):
  """Delete the books that the database reports as duplicates.

  Does nothing when ``limit`` is below 2. Otherwise db.ListDups(limit) yields
  records whose first three fields are passed to db.ListByTitleAndAuthor();
  every id returned by those lookups is collected once and handed to
  DelBook().

  NOTE(review): whether the lookup returns bare ids or row tuples is not
  visible here; the values are passed to DelBook() unchanged.
  """
  if limit<2:
    return
  doomed = set()
  for rec in db.ListDups(limit):
    doomed.update(db.ListByTitleAndAuthor(rec[0],rec[1],rec[2]))
  for book in doomed:
    print("\r Deleting %s..." % (book))
    DelBook(book)
294
def main():
  """Import every FB2 file waiting in db.tmp_files, then compress up to 2000
  books that are still stored as plain FB2."""
  compress_limit = 2000
  print("Processing...\r")
  ProcessDir(db.tmp_files)
  CompressAll(compress_limit)
299
# Script entry point: runs the import/compress pass when executed directly.
if __name__ == "__main__":
    import sys
    # Python 2 only: sys.setdefaultencoding is removed from the sys module at
    # interpreter start-up, so sys is reloaded to get it back. The default
    # encoding is then set to UTF-8 for implicit str/unicode conversions.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    main()
305