OpenLibrary files

author Roman Bazalevsky <rvb@rvb.name>

Fri, 29 Nov 2013 13:54:30 +0000 (17:54 +0400)

committer Roman Bazalevsky <rvb@rvb.name>

Fri, 29 Nov 2013 13:54:30 +0000 (17:54 +0400)
author Roman Bazalevsky <rvb@rvb.name>
Fri, 29 Nov 2013 13:54:30 +0000 (17:54 +0400)
committer Roman Bazalevsky <rvb@rvb.name>
Fri, 29 Nov 2013 13:54:30 +0000 (17:54 +0400)
diff --git a/check-upload.sh b/check-upload.sh

new file mode 100755 (executable)

index 0000000..604c6e8
--- /dev/null
+++ b/check-upload.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+# Periodic upload sweep: unpack uploaded zip archives, hand settled .fb2
+# files over to the importer's temp directory and run fb2_process.py.
+# A pidfile keeps two sweeps from running at the same time.
+
+PIDFILE=/var/run/openlib.pid
+
+if [ -e "$PIDFILE" ] ; then
+    # our pidfile exists, let's make sure the process is still running though
+    PID=`/bin/cat "$PIDFILE"`
+    if /bin/kill -0 "$PID" > /dev/null 2>&1 ; then
+        # indeed it is, i'm outta here!
+        /bin/echo 'Previous instance running...'
+        exit 0
+    fi
+fi
+
+# create or update the pidfile
+/bin/echo "$$" > "$PIDFILE"
+# The pidfile is ours from here on: drop it on every exit path, including
+# the early error exits below.
+trap '/bin/rm -f "$PIDFILE"' EXIT
+
+# Expected to define $upload and $temp, which are used below.
+. /etc/openlib.conf
+
+# An empty $upload would make find walk the current directory instead.
+if [ -z "$upload" ] || [ -z "$temp" ] ; then
+    /bin/echo 'upload/temp directories are not configured' >&2
+    exit 1
+fi
+
+cd "$upload" || exit 1
+# Files must be untouched for 10 minutes so half-uploaded ones are skipped.
+find "$upload" -type f -name "*.zip" -mmin +10 -exec sh -c 'unzip "$1" && rm "$1"' _ {} \;
+find "$upload" -type f -name "*.fb2" ! -user www-data -exec chown www-data:users {} \;
+find "$upload" -type f -name "*.fb2" -user www-data -mmin +10 -exec mv {} "$temp" \;
+
+cd /opt/openlibrary || exit 1
+/opt/openlibrary/fb2_process.py
+
+exit 0
diff --git a/db.py b/db.py

new file mode 100644 (file)

index 0000000..84a999d
--- /dev/null
+++ b/db.py
@@ -0,0 +1,243 @@
+#!/usr/bin/python
+
+import MySQLdb
+import ConfigParser
+
+def _require_db():
+  """Stop the program with a message when there is no database connection."""
+  if not database:
+    print("No connection to DB")
+    exit()
+
+def _cursor():
+  """Return a new cursor on the shared module-level connection."""
+  _require_db()
+  return database.cursor()
+
+def _execute(sql, params):
+  """Run one parameterized statement and return the cursor used.
+
+  The DB-API wants a sequence (or mapping) of parameters, so a bare scalar
+  is wrapped into a 1-tuple.  Sequences pass through untouched, which keeps
+  callers working that hand over a whole result row instead of a value.
+  """
+  if not isinstance(params, (tuple, list, dict)):
+    params = (params,)
+  c = _cursor()
+  c.execute(sql, params)
+  return c
+
+def _scalar(sql, params):
+  """Run a statement and return the first column of its first row."""
+  return _execute(sql, params).fetchone()[0]
+
+def SortName(name):
+  """Return metadata.SortStr(name) as computed by the database."""
+  return _scalar('SELECT metadata.SortStr(%s)', name)
+
+def SortAuthorName(name):
+  """Return metadata.SortAuthor(name) as computed by the database."""
+  return _scalar('SELECT metadata.SortAuthor(%s)', name)
+
+def GetOrCreateAuthor(name):
+  """Return the result of metadata.GetOrCreateAuthor(name)."""
+  return _scalar('SELECT metadata.GetOrCreateAuthor(%s)', name)
+
+def GetOrCreateLang(name):
+  """Return the result of metadata.GetOrCreateLang(name)."""
+  return _scalar('SELECT metadata.GetOrCreateLang(%s)', name)
+
+def GetOrCreatePublisher(name):
+  """Return the result of metadata.GetOrCreatePublisher(name)."""
+  return _scalar('SELECT metadata.GetOrCreatePublisher(%s)', name)
+
+def GetOrCreateSeries(name):
+  """Return the result of metadata.GetOrCreateSeries(name)."""
+  return _scalar('SELECT metadata.GetOrCreateSeries(%s)', name)
+
+def GetOrCreateTag(name):
+  """Return the result of metadata.GetOrCreateTag(name)."""
+  return _scalar('SELECT metadata.GetOrCreateTag(%s)', name)
+
+def CreateBook(title,pubdate,series_index,isbn):
+  """Return the result of metadata.CreateBook(...) for the given fields."""
+  return _scalar('SELECT metadata.CreateBook(%s,%s,%s,%s)', (title,pubdate,series_index,isbn))
+
+def LinkBookToAuthors(book_id,author_ids):
+  """Insert one books_authors_link row per entry of author_ids."""
+  c = _cursor()
+  for author_id in author_ids:
+    c.execute('INSERT INTO metadata.books_authors_link(book,author) VALUES (%s,%s)', (book_id,author_id))
+
+def LinkBookToLangs(book_id,lang_ids):
+  """Insert one books_languages_link row per language, item_order counting from 1."""
+  c = _cursor()
+  for io, lang_id in enumerate(lang_ids, 1):
+    c.execute('INSERT INTO metadata.books_languages_link(book,lang_code,item_order) VALUES (%s,%s,%s)', (book_id,lang_id,io))
+
+def LinkBookToPublishers(book_id,pub_id):
+  """Insert a books_publishers_link row for the book and publisher."""
+  _execute('INSERT INTO metadata.books_publishers_link(book,publisher) VALUES (%s,%s)', (book_id,pub_id))
+
+def LinkBookToSeries(book_id,ser_id):
+  """Insert a books_series_link row for the book and series."""
+  _execute('INSERT INTO metadata.books_series_link(book,series) VALUES (%s,%s)', (book_id,ser_id))
+
+def LinkBookToTags(book_id,tag_ids):
+  """Insert one books_tags_link row per entry of tag_ids."""
+  c = _cursor()
+  for tag_id in tag_ids:
+    c.execute('INSERT INTO metadata.books_tags_link(book,tag) VALUES (%s,%s)', (book_id,tag_id))
+
+def SetPath(book_id,path,dataname,filesize,cover):
+  """Store the book's path and cover flag and register its FB2 data row."""
+  c = _cursor()
+  c.execute('UPDATE metadata.books SET path=%s, has_cover=%s WHERE id=%s', (path,cover,book_id))
+  c.execute('INSERT INTO metadata.data(book,format,uncompressed_size,name) values (%s,%s,%s,%s)',(book_id,'FB2',filesize,dataname))
+
+def StoreComment(book_id,comment):
+  """Insert the comment text for the book."""
+  _execute('INSERT INTO metadata.comments(book,text) values (%s,%s)',(book_id,comment))
+
+def PathByID(book_id):
+  """Return the path column of the book; raises TypeError if no row matches."""
+  return _scalar('SELECT path FROM metadata.books WHERE id=%s', book_id)
+
+def DataByID(book_id,format):
+  """Return the data file name of the book: stored name + '.' + lower-cased format."""
+  return _scalar('SELECT name FROM metadata.data WHERE book=%s and format=%s',(book_id,format))+'.'+format.lower()
+
+def DelBook(book_id):
+  """Delete the book's row from metadata.books."""
+  _execute('DELETE FROM metadata.books WHERE id=%s', book_id)
+
+def ChangeBookFormat(book_id,old_format,new_format):
+  """Rename the format of the book's data row from old_format to new_format."""
+  _execute('UPDATE metadata.data SET format=%s WHERE book=%s and format=%s',(new_format,book_id,old_format))
+
+def TestArchive(name):
+  """Return how many processed_archives rows carry this file name."""
+  return _scalar('SELECT count(*) from metadata.processed_archives WHERE filename=%s', name)
+
+def MarkArchive(name):
+  """Record the archive file name in processed_archives."""
+  _execute('insert into metadata.processed_archives(filename) values (%s)', name)
+
+def ListByFormat(format,limit=100):
+  """Return up to `limit` rows of distinct book ids having data in `format`."""
+  return _execute('SELECT DISTINCT book FROM metadata.data WHERE format=%s ORDER BY book LIMIT 0,%s',(format,limit)).fetchall()
+
+def Commit():
+  """Commit the current transaction on the shared connection."""
+  _require_db()
+  database.commit()
+
+def Rollback():
+  """Roll back the current transaction on the shared connection."""
+  _require_db()
+  database.rollback()
+
+
+# Module initialisation: read /etc/openlib.conf and open the shared MySQL
+# connection.  Either step failing prints a message and ends the program.
+try:
+
+  cfg = ConfigParser.RawConfigParser(allow_no_value=True)
+  # 'with' closes the configuration file even when parsing fails
+  with open('/etc/openlib.conf') as conf_file:
+    cfg.readfp(conf_file)
+  dbhost = cfg.get("mysql","host")
+  dbuser = cfg.get("mysql","user")
+  dbpasswd = cfg.get("mysql","passwd")
+  file_root = cfg.get("storage","files")
+  tmp_files = cfg.get("storage","temp")
+  failed_files = cfg.get("storage","failed")
+  upload_files = cfg.get("storage","upload")
+
+except (IOError, OSError, ConfigParser.Error):
+
+  # unreadable file, or a missing section/option
+  print("Error reading configuration file")
+  exit()
+
+try:
+
+  database = MySQLdb.connect(host=dbhost,user=dbuser,passwd=dbpasswd,use_unicode=True)
+  database.set_character_set('utf8')
+  c = database.cursor()
+  c.execute('SET NAMES utf8;')
+
+except MySQLdb.Error:
+
+  print("Error connecting database")
+  exit()
+        
+\ No newline at end of file
diff --git a/fb2_meta.py b/fb2_meta.py

new file mode 100644 (file)

index 0000000..394ae51
--- /dev/null
+++ b/fb2_meta.py
@@ -0,0 +1,325 @@
+from __future__ import with_statement
+__license__   = 'GPL v3'
+__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
+                '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
+'''Read meta information from fb2 files'''
+
+
+# TODO clean up and save only needed (sorry for this code:) )
+
+# -------------------------------------------
+
+#From calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
+#             /src/calibre/ebooks/metadata/fb2.py
+#Based on revision 10897
+
+import os
+import datetime
+from functools import partial
+# from base64 import b64decode
+from lxml import etree
+#from calibre.utils.date import parse_date
+#from calibre import guess_type, guess_all_extensions, prints, force_unicode
+#from calibre.ebooks.metadata import MetaInformation, check_isbn
+#from calibre.ebooks.chardet import xml_to_unicode
+
+
+# -------------------------------------------
+
+def force_unicode(text):
+    """Return text as a unicode object, decoding byte strings as UTF-8."""
+    if isinstance(text, unicode):
+        return text
+    return unicode(text, encoding='utf-8')
+
+# -------------------------------------------
+# from calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
+#              /src/calibre/ebooks/chardet/__init__.py
+# Based on rev 10897
+
+import re, codecs
+# Patterns that locate an encoding declaration; group 1 is the encoding name.
+ENCODING_PATS = [
+                 # processing-instruction style: <?... encoding="..." ...>
+                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
+                            re.IGNORECASE),
+                 # HTML style: <meta ... content="...charset=..." ...>
+                 re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
+                            re.IGNORECASE),
+                 ]
+
+def strip_encoding_declarations(raw):
+    """Remove encoding declarations found in the leading 50 KiB of raw.
+
+    Each pattern of ENCODING_PATS is applied in turn to the first 50 KiB of
+    the text as it stands at that point; the remainder is left untouched.
+    """
+    head_size = 50*1024
+    for declaration in ENCODING_PATS:
+        raw = declaration.sub('', raw[:head_size]) + raw[head_size:]
+    return raw
+
+def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
+                   resolve_entities=False, assume_utf8=False):
+    '''
+    Force conversion of a byte string to unicode.
+
+    A UTF-8/UTF-16 byte-order mark decides the encoding when present.
+    Otherwise the first XML/HTML encoding declaration found is used, falling
+    back to UTF-8 when there is none or the declared codec is unknown.
+    Undecodable bytes are replaced rather than raised.
+
+    verbose, resolve_entities and assume_utf8 are accepted but not used.
+
+    @param strip_encoding_pats: remove encoding declarations from the result
+    @return: (unicode, encoding used); the encoding is None when raw is
+             empty or was already unicode
+    '''
+    encoding = None
+    if not raw:
+        return u'', encoding
+    if not isinstance(raw, unicode):
+        # [1:] drops the decoded BOM character; 'replace' keeps a damaged
+        # file from raising here, matching the declaration-based path below
+        if raw.startswith(codecs.BOM_UTF8):
+            raw, encoding = raw.decode('utf-8', 'replace')[1:], 'utf-8'
+        elif raw.startswith(codecs.BOM_UTF16_LE):
+            raw, encoding = raw.decode('utf-16-le', 'replace')[1:], 'utf-16-le'
+        elif raw.startswith(codecs.BOM_UTF16_BE):
+            raw, encoding = raw.decode('utf-16-be', 'replace')[1:], 'utf-16-be'
+    if not isinstance(raw, unicode):
+        for pat in ENCODING_PATS:
+            match = pat.search(raw)
+            if match:
+                encoding = match.group(1)
+                break
+        if encoding is None:
+            encoding = 'utf-8'
+
+        try:
+            if encoding.lower().strip() == 'macintosh':
+                encoding = 'mac-roman'
+            if encoding.lower().replace('_', '-').strip() in (
+                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
+                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
+                # Microsoft Word exports to HTML with encoding incorrectly set to
+                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
+                encoding = 'gbk'
+            raw = raw.decode(encoding, 'replace')
+        except LookupError:
+            # the declared codec does not exist
+            encoding = 'utf-8'
+            raw = raw.decode(encoding, 'replace')
+
+    if strip_encoding_pats:
+        raw = strip_encoding_declarations(raw)
+
+    return raw, encoding
+
+
+# -------------------------------------------
+
+# Prefix -> namespace URI map used by every XPath expression in this module.
+NAMESPACES = {
+    'fb2'   :   'http://www.gribuser.ru/xml/fictionbook/2.0',
+    'xlink' :   'http://www.w3.org/1999/xlink'  }
+
+# etree.XPath with the namespace map above already bound.
+XPath = partial(etree.XPath, namespaces=NAMESPACES)
+# etree.tostring preset to text output returned as unicode.
+tostring = partial(etree.tostring, method='text', encoding=unicode)
+
+def get_metadata(stream):
+    '''
+    Read FB2 metadata from an open file-like object.
+
+    Returns a plain attribute holder.  book_title and authors are always
+    set when the document could be parsed; comments, tags, series,
+    series_index, isbn, publisher, pubdate, language, languages and cover
+    are set only when found.  If no XML root could be obtained the holder
+    is returned without any attributes.
+    '''
+
+    mi = type('lamdbaobject', (object,), {})()
+
+    root = _get_fbroot(stream)
+    if root is None:
+        return mi
+
+    book_title = _parse_book_title(root)
+    if book_title:
+        book_title = unicode(book_title)
+    else:
+        # fall back to the stream's name; in-memory streams do not have one
+        book_title = force_unicode(getattr(stream, 'name', 'Unknown'))
+    mi.book_title = book_title
+    mi.authors = _parse_authors(root)
+
+    # Optional fields are best effort: a malformed value must not cost the
+    # rest of the metadata, so each parser runs in isolation.
+    # _parse_timestamp and _parse_uuid are intentionally not run.
+    optional_parsers = (
+        _parse_comments,
+        _parse_tags,
+        _parse_series,
+        _parse_isbn,
+        _parse_publisher,
+        _parse_pubdate,
+        _parse_language,
+        lambda root, mi: _parse_cover_data(root, 'cover.jpg', mi),
+    )
+    for parse in optional_parsers:
+        try:
+            parse(root, mi)
+        except Exception:
+            pass
+
+    return mi
+
+def _parse_authors(root):
+    """Return the list of author display names, never empty.
+
+    All authors are taken from <title-info>; <src-title-info> is consulted
+    only when <title-info> yields none, so the two are never mixed.
+    'Unknown' is returned when neither section names an author.
+    """
+    authors = []
+    for author_sec in ['title-info', 'src-title-info']:
+        for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
+            author = _parse_author(au)
+            if author:
+                authors.append(author)
+        # stop at the first section that produced anything
+        if authors:
+            break
+
+    # if no author so far
+    if not authors:
+        authors.append('Unknown')
+
+    return authors
+
+def _parse_author(elm_author):
+    """Return the display name of one <author> element as a single string.
+
+    The name is "first middle last" with missing parts left out; the
+    nickname is used when none of the three is present.  The result is
+    empty when the element carries no usable name at all.
+    """
+
+    def text_of(tag):
+        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)
+
+    name_parts = [text_of('first-name'), text_of('middle-name'), text_of('last-name')]
+    author = ' '.join(part for part in name_parts if part)
+
+    # fallback to nickname
+    if not author:
+        author = text_of('nickname')
+
+    return author
+
+
+def _parse_book_title(root):
+    """Return the whitespace-normalized book title, or '' if there is none.
+
+    Titles are looked up under <title-info>, <publish-info> and
+    <src-title-info>; the XPath union hands the first match to
+    normalize-space().
+    """
+    title_paths = [
+        '//fb2:title-info/fb2:book-title/text()',
+        '//fb2:publish-info/fb2:book-title/text()',
+        '//fb2:src-title-info/fb2:book-title/text()',
+    ]
+    return XPath('normalize-space(%s)' % '|'.join(title_paths))(root)
+
+#TODO add from calibre
+#TODO add from calibre
+def _parse_cover_data(root, imgid, mi):
+    """Store the text of <binary id=imgid> as mi.cover.
+
+    The value is stored exactly as found in the document, without decoding.
+    mi.cover is left unset when the element is missing or has no text, so
+    consumers never see a cover of None.
+    """
+    elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
+    if elm_binary:
+        pic_data = elm_binary[0].text
+        if pic_data:
+            mi.cover = pic_data
+
+def _parse_tags(root, mi):
+    """Set mi.tags to the genre texts of the first section that has any.
+
+    <title-info> is tried first, then <src-title-info>; mi.tags stays
+    unset when neither contains a genre.
+    """
+    for section in ('title-info', 'src-title-info'):
+        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
+        if not genres:
+            continue
+        mi.tags = [unicode(genre) for genre in genres]
+        return
+
+def _parse_series(root, mi):
+    """Set mi.series (and mi.series_index) from the first <sequence> found.
+
+    Only the first sequence of <title-info> and of <publish-info> is
+    considered, and of those the first one in the result is used.
+    series_index is only set when the sequence has a non-empty name.
+    """
+    #TODO parse all
+    sequence_paths = ['//fb2:title-info/fb2:sequence[1]',
+                      '//fb2:publish-info/fb2:sequence[1]']
+    found = XPath('|'.join(sequence_paths))(root)
+    if not found:
+        return
+
+    first = found[0]
+    mi.series = first.get('name', None)
+    if mi.series:
+        mi.series_index = first.get('number', None)
+
+def _parse_isbn(root, mi):
+    """Set mi.isbn from <publish-info>/<isbn>, if present.
+
+    Some files list several comma-separated ISBNs although only one is
+    allowed; in that case everything from the first comma on is dropped.
+    """
+    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
+    if not isbn:
+        return
+    if ',' in isbn:
+        isbn = isbn.split(',', 1)[0]
+
+    #TODO add from calibre: validate with check_isbn before storing
+    mi.isbn = isbn
+
+def _parse_comments(root, mi):
+    """Set mi.comments to the text of the first <annotation> found.
+
+    <title-info> is tried first, then <src-title-info>.
+    """
+    for section in ('title-info', 'src-title-info'):
+        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
+        if not annotations:
+            continue
+        # TODO: tags i18n, xslt?
+        mi.comments = tostring(annotations[0])
+        return
+
+def _parse_publisher(root, mi):
+    """Set mi.publisher from <publish-info>/<publisher> when it is non-empty."""
+    found = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
+    if not found:
+        return
+    mi.publisher = found
+
+def _parse_pubdate(root, mi):
+    """Set mi.pubdate to 1 January of <publish-info>/<year>.
+
+    Nothing is set unless the year evaluates to a whole number.
+    """
+    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
+    if not year.is_integer():
+        return
+    # only the year is available, so use the 1st of January
+    mi.pubdate = datetime.date(int(year), 1, 1)
+
+def _parse_timestamp(root, mi):
+    """Set mi.timestamp to the raw <document-info>/<date> string, if any.
+
+    Both the value attribute and the element text are candidates, e.g.
+    <date value="1996-12-03">03.12.1996</date>.
+    """
+    candidates = ('//fb2:document-info/fb2:date/@value',
+                  '//fb2:document-info/fb2:date/text()')
+    docdate = XPath('string(%s)' % '|'.join(candidates))(root)
+    if not docdate:
+        return
+    #TODO add from calibre: mi.timestamp = parse_date(docdate)
+    mi.timestamp = docdate
+
+def _parse_language(root, mi):
+    """Set mi.language and the one-element list mi.languages from <title-info>/<lang>."""
+    found = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
+    if not found:
+        return
+    mi.language = found
+    mi.languages = [ found ]
+
+def _parse_uuid(root, mi):
+    """Set mi.uuid from <document-info>/<id> when it is non-empty."""
+    # every step needs the fb2: prefix; an unprefixed document-info names an
+    # element in no namespace and never matches a FictionBook document
+    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
+    if uuid:
+        mi.uuid = uuid
+
+def _get_fbroot(stream):
+    """Parse the whole stream and return the XML root element.
+
+    The content is converted to unicode first with its encoding
+    declarations stripped.  The parser recovers from malformed markup.
+    """
+    # The input is an uploaded file, so entities are left unresolved and
+    # the network is off: the document cannot pull in local files or
+    # expand entity bombs.
+    parser = etree.XMLParser(recover=True, no_network=True,
+                             resolve_entities=False)
+    raw = stream.read()
+    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
+    root = etree.fromstring(raw, parser=parser)
+    return root
diff --git a/fb2_process.py b/fb2_process.py

new file mode 100755 (executable)

index 0000000..38d8876
--- /dev/null
+++ b/fb2_process.py
@@ -0,0 +1,291 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import re
+import db
+import fb2_meta
+import os
+import sys
+import shutil
+
+mapping = None
+
+def GetTagsMapping(db):
+  """Load metadata.tags_mapping into the global `mapping`.
+
+  `db` is a connection object; afterwards `mapping` is a list of
+  (compiled tag_mask, UTF-8 encoded tag_result) pairs.
+  """
+  global mapping
+  cur = db.cursor()
+  cur.execute("select tag_mask,tag_result from metadata.tags_mapping")
+  mapping = cur.fetchall()
+  mapping = [(re.compile(row[0]),row[1].encode('utf-8')) for row in mapping]
+  
+def Matches(tag):
+  """Return the replacement of the first mapping whose mask matches tag.
+
+  The tag itself is returned when no mask matches.
+  """
+  for mask, replacement in mapping:
+    if mask.match(tag):
+      return replacement
+  return tag
+
+def NewTags(tags):
+  """Return the de-duplicated list of tags after applying the tag mapping.
+
+  The mapping is loaded from the database while it is still empty or unset.
+  """
+  if not mapping:
+    GetTagsMapping(db.database)
+  return list(set(Matches(tag) for tag in tags))
+
+class MetaData:
+  """Book record built from parsed FB2 metadata and stored in the database.
+
+  Constructing an instance creates the book and its links (authors,
+  languages, publisher, series, tags, comment) through the db module and
+  leaves state == "done".  When the mapped tags contain 'trash' nothing is
+  stored and state == "trash".  Nothing is committed here.
+  """
+
+  def GetPath(self):
+    """Derive path and dataname from the title, author and book id."""
+    self.sort_title = db.SortName(self.title).replace(' ','_')
+    self.sort_author = db.SortAuthorName(self.author).replace(' ','_')
+    self.path = ("%s/%s/%s/%s/%s (%d)" % (self.sort_author[0],self.sort_author[0:2],self.sort_author[0:4],self.sort_author[0:32],self.sort_title[0:64],self.book_id))
+    self.dataname = (self.title.decode('utf-8')[0:64]+' '+self.author.decode('utf-8')[0:32]).replace('/','')
+
+  def __init__(self,meta_dict,size):
+    """Store the book described by meta_dict; size is the file size.
+
+    meta_dict is the attribute dictionary produced by
+    fb2_meta.get_metadata; every key is optional.
+    """
+
+    self.size = size
+    # optional attributes always exist, None meaning "not in the file"
+    self.publisher = None
+    self.series = None
+    self.series_idx = None
+
+    try:
+      tags=NewTags(meta_dict['tags'])
+    except Exception:
+      tags=['other']
+    if 'trash' in tags:
+      self.state="trash"
+      return
+    self.tags=tags
+    if tags:
+      tag=tags[0]
+    else:
+      tag='Жанр неизвестен'
+
+    try:
+      self.author=meta_dict['authors'][0].encode('utf-8')
+      self.authors = meta_dict['authors']
+      author_ids = set()
+      try:
+        for author in self.authors:
+          author_ids.add(db.GetOrCreateAuthor(author.encode('utf-8')))
+      except Exception:
+        # best effort: keep the authors resolved so far
+        pass
+    except Exception:
+      self.author='Неизвестный Автор (%s)' % (tag)
+      self.authors = []
+      author_ids = []
+
+    try:
+      self.langs=meta_dict['languages']
+    except KeyError:
+      self.langs=['ru']
+    lang_ids = set()
+    try:
+      # iterate self.langs so the default language is linked as well
+      for lang in self.langs:
+        lang_ids.add(db.GetOrCreateLang(lang.encode('utf-8')))
+    except Exception:
+      pass
+
+    try:
+      self.publisher = meta_dict['publisher'].encode('utf-8')
+      pub_id=db.GetOrCreatePublisher(self.publisher)
+    except Exception:
+      pub_id=None
+
+    try:
+      title = meta_dict['book_title'].encode('utf-8')
+    except Exception:
+      title='Название неизвестно'
+    self.title=title
+
+    pub_date=meta_dict.get('pubdate')
+    self.pub_date=pub_date
+
+    try:
+      isbn=meta_dict['isbn'].encode('utf-8')
+    except Exception:
+      isbn=None
+    self.isbn=isbn
+
+    try:
+      self.series = meta_dict['series'].encode('utf-8')
+      ser_id=db.GetOrCreateSeries(self.series)
+    except Exception:
+      ser_id=None
+    ser_num=meta_dict.get('series_index')
+    self.series_idx = ser_num
+
+    tag_ids = set()
+    try:
+      for tag in tags:
+        tag_ids.add(db.GetOrCreateTag(tag))
+    except Exception:
+      pass
+
+    try:
+      self.cover=meta_dict['cover']
+      self.has_cover=1
+    except KeyError:
+      self.has_cover=0
+
+    try:
+      self.comments=meta_dict['comments'].encode('utf-8')
+    except Exception:
+      self.comments=''
+
+    book_id = db.CreateBook(title,pub_date,ser_num,isbn)
+    self.book_id = book_id
+
+    db.LinkBookToAuthors(book_id,author_ids)
+    db.LinkBookToLangs(book_id,lang_ids)
+    if pub_id:
+      db.LinkBookToPublishers(book_id,pub_id)
+    if ser_id:
+      db.LinkBookToSeries(book_id,ser_id)
+    db.LinkBookToTags(book_id,tag_ids)
+    if self.comments:
+      db.StoreComment(book_id,self.comments)
+
+    self.GetPath()
+    db.SetPath(self.book_id,self.path,self.dataname,self.size,self.has_cover)
+    self.state="done"
+
+def ProcessFile(filename):
+  """Import one FB2 file into the library.
+
+  On success the file is moved below db.file_root, its cover (if any) is
+  written next to it and the transaction is committed.  Files whose tags
+  map to 'trash' are deleted.  Any failure, including unreadable
+  metadata, moves the file to db.failed_files and rolls back.
+  """
+
+  # Where the file currently lives; the failure path needs this because
+  # the file may already have been moved into the library tree.
+  current_path = filename
+
+  try:
+    size = os.path.getsize(filename)
+    stream = open(filename,'rb')
+    try:
+      meta = fb2_meta.get_metadata(stream)
+    finally:
+      stream.close()
+
+    book = MetaData(meta.__dict__,size)
+
+    if book.state=="done":
+
+      new_dir_path = db.file_root + book.path
+      new_file_path = new_dir_path + '/' + book.dataname + '.fb2'
+
+      os.makedirs(new_dir_path,0o755)
+      shutil.move(filename,new_file_path)
+      current_path = new_file_path
+
+      if book.has_cover:
+        cover_path = new_dir_path + '/cover.jpg'
+        print("Book has cover, try to store to "+cover_path)
+        # binary mode: the decoded cover is image data, not text
+        coverfile = open(cover_path,'wb')
+        try:
+          coverfile.write(book.cover.decode('base64'))
+        finally:
+          coverfile.close()
+
+      db.Commit()
+      print("Moved to "+new_dir_path)
+
+    elif book.state=="trash":
+
+      print("Tags blacklisted, trashing")
+      os.remove(filename)
+
+    else:
+
+      shutil.move(filename,db.failed_files+os.path.basename(filename))
+      print("Moved to failed_files")
+      db.Rollback()
+
+  except Exception as e:
+
+    # repr() cannot fail on non-ASCII exception messages
+    print("Failed to process %s: %r" % (filename,e))
+    if os.path.exists(current_path):
+      shutil.move(current_path,db.failed_files+os.path.basename(filename))
+      print("Moved to failed_files")
+    db.Rollback()
+
+def ProcessDir(dirname):
+  """Run ProcessFile on every entry of dirname whose name ends in .fb2."""
+  for entry in os.listdir(dirname):
+    if not entry.endswith(".fb2"):
+      continue
+    print("Processing "+entry)
+    ProcessFile(os.path.join(dirname,entry))
+
+def DelBook(id):
+  """Delete a book's files, its directory and its database row, then commit."""
+  rel_path = db.PathByID(id)
+  # Checked before joining: with an empty path os.path.join would return
+  # the library root itself and the loop below would empty it.
+  if not rel_path:
+    return
+  path = os.path.join(db.file_root,rel_path)
+  for name in os.listdir(path):
+    os.remove(os.path.join(path,name))
+  db.DelBook(id)
+  os.rmdir(path)
+  db.Commit()
+
+def CompressBook(id):
+  """Zip a book's FB2 file in place and mark its format as FB2.ZIP.
+
+  The format is changed and committed only when zip exits with status 0.
+  """
+  import subprocess
+  path=db.PathByID(id)
+  if path:
+    datafile = os.path.join(db.file_root,path,db.DataByID(id,'FB2'))
+    zipfile = datafile + '.zip'
+    # File names come from book metadata, so the command is passed as an
+    # argument list and never goes through a shell.
+    command = ['zip','--move','--junk-paths',zipfile,datafile]
+    command = [arg.encode('utf-8') if isinstance(arg,unicode) else arg for arg in command]
+    print(' '.join(command))
+    if subprocess.call(command)==0:
+      db.ChangeBookFormat(id,'FB2','FB2.ZIP')
+      db.Commit()
+
+def UnCompressBook(id):
+  """Unzip a book's FB2.ZIP file in place and mark its format as FB2.
+
+  The archive is removed and the format change committed only when unzip
+  exits with status 0.
+  """
+  import subprocess
+  path=db.PathByID(id)
+  if path:
+    target_dir = os.path.join(db.file_root,path)
+    datafile = os.path.join(target_dir,db.DataByID(id,'FB2.ZIP'))
+    # File names come from book metadata, so the command is passed as an
+    # argument list and never goes through a shell.
+    command = ['unzip',datafile,'-d',target_dir]
+    command = [arg.encode('utf-8') if isinstance(arg,unicode) else arg for arg in command]
+    if subprocess.call(command)==0:
+      os.remove(datafile)
+      db.ChangeBookFormat(id,'FB2.ZIP','FB2')
+      db.Commit()
+
+def CompressAll(limit=100):
+  """Compress up to `limit` books that are still stored as plain FB2.
+
+  A book that fails to compress is reported and skipped so the rest of the
+  batch still runs.
+  """
+  for row in db.ListByFormat('FB2',limit):
+    try:
+      CompressBook(row[0])
+    except Exception as e:
+      # Exception only: Ctrl-C and exit() must still stop the run
+      print("Failed to compress book %s: %r" % (row[0],e))
+
+def CheckFiles(delete = 0):
+  """Report books whose data file is missing on disk.
+
+  Books in FB2 format are checked first, then those in FB2.ZIP, up to
+  300000 each, with a running counter written to stdout.  With delete == 1
+  the database row of every missing book is deleted and committed.
+  """
+  for fmt in ('FB2','FB2.ZIP'):
+    cnt = 0
+    for row in db.ListByFormat(fmt,300000):
+      # rows are 1-tuples; the id is their only column
+      book_id = row[0]
+      cnt = cnt + 1
+      sys.stdout.write("\r%s"%(cnt))
+      datafile = os.path.join(db.file_root,db.PathByID(book_id),db.DataByID(book_id,fmt))
+      if not os.path.exists(datafile):
+        print("\r File %s not found" % (datafile))
+        if delete==1:
+          db.DelBook(book_id)
+          db.Commit()
+
+def main():
+  """Import everything waiting in db.tmp_files, then compress up to 2000 books."""
+  compress_limit = 2000
+  ProcessDir(db.tmp_files)
+  CompressAll(compress_limit)
+
+# Script entry point.
+if __name__ == "__main__":
+    # NOTE(review): reload(sys) + setdefaultencoding is a Python 2-only
+    # trick; presumably it is here so implicit str/unicode conversions in
+    # this script use UTF-8 instead of ASCII - confirm before porting.
+    import sys
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+    main()
+               
+\ No newline at end of file
diff --git a/get_flibusta.py b/get_flibusta.py

new file mode 100755 (executable)

index 0000000..654b5be
--- /dev/null
+++ b/get_flibusta.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+from BeautifulSoup import BeautifulSoup
+import urllib
+import db
+import re
+import os
+
+import subprocess
+
+# Fetch the daily archives from the first mirror that answers and lists
+# any, downloading the ones not yet recorded in the database.
+
+# NOTE: the dots are unescaped, so "f.fb2" matches any character there.
+pattern = re.compile("^f.fb2.*")
+os.environ['http_proxy']='http://localhost:3128'
+os.environ['no_proxy']='localhost,127.0.0.1'
+proxies = {'http': 'http://localhost:3128'}
+
+matched = False
+
+for host in ['flibusta.net','flibustahezeous3.onion','flibusta.i2p']:
+
+  try:
+    print("Trying %s" % (host))
+    html_page = urllib.urlopen("http://%s/daily/" % (host))
+    html = BeautifulSoup(html_page)
+
+    matched = False
+
+    for link in html.findAll('a'):
+      file = link.get("href")
+      if not file:
+        # anchors without href are not downloads
+        continue
+      print(file)
+      if pattern.match(file):
+        matched = True
+        if not db.TestArchive(file):
+          print("Processing %s" % file)
+          url = ("http://%s/daily/%s" % (host,file)).encode('utf-8')
+          # href comes from the remote page: pass it to wget as a single
+          # argument, never through a shell
+          if subprocess.call(['wget','-c','-P',db.upload_files,url]) == 0:
+            db.MarkArchive(file)
+            db.Commit()
+  except Exception:
+    # this mirror failed, try the next one
+    matched = False
+
+  if matched:
+    break
+
+if matched:
+  print("Got from %s" % host)
+else:
+  print("Failed to get")
author	Roman Bazalevsky <rvb@rvb.name>
	Fri, 29 Nov 2013 13:54:30 +0000 (17:54 +0400)
committer	Roman Bazalevsky <rvb@rvb.name>
	Fri, 29 Nov 2013 13:54:30 +0000 (17:54 +0400)
check-upload.sh	[new file with mode: 0755]	patch \| blob
db.py	[new file with mode: 0644]	patch \| blob
fb2_meta.py	[new file with mode: 0644]	patch \| blob
fb2_process.py	[new file with mode: 0755]	patch \| blob
get_flibusta.py	[new file with mode: 0755]	patch \| blob