#!/bin/sh
#
# check-upload.sh - driver for the OpenLibrary upload pipeline.
#
# Unpacks uploaded *.zip archives, fixes ownership of uploaded *.fb2 files,
# moves settled *.fb2 files to the temp directory and then runs
# fb2_process.py. A pidfile prevents overlapping runs.

PIDFILE=/var/run/openlib.pid

if [ -e "$PIDFILE" ] ; then
  # our pidfile exists, let's make sure the process is still running though
  PID=`/bin/cat "$PIDFILE"`
  if /bin/kill -0 "$PID" > /dev/null 2>&1 ; then
    # indeed it is, i'm outta here!
    /bin/echo 'Previous instance running...'
    exit 0
  fi
fi

# create or update the pidfile
/bin/echo "$$" > "$PIDFILE"

# Remove the pidfile however the script ends. Signals are routed through
# `exit` so that the EXIT trap also runs in shells that skip it on signals.
trap '/bin/rm -f "$PIDFILE"' EXIT
trap 'exit 1' HUP INT TERM

# Expected to define $upload and $temp, which are used below.
. /etc/openlib.conf

# Archives are unpacked into the current directory, so refuse to continue
# if the upload directory cannot be entered.
cd "$upload" || exit 1

# Only touch files untouched for 10+ minutes, i.e. uploads that have finished.
find "$upload" -type f -name "*.zip" -mmin +10 -exec sh -c 'unzip "$1" && rm "$1"' _ {} \;
find "$upload" -type f -name "*.fb2" ! -user www-data -exec chown www-data:users {} \;
find "$upload" -type f -name "*.fb2" -user www-data -mmin +10 -exec mv {} "$temp" \;

cd /opt/openlibrary
/opt/openlibrary/fb2_process.py

/bin/rm -f "$PIDFILE"

exit 0
def LinkBookToLangs(book_id,lang_ids):
    """Link a book to its languages, keeping their order.

    Inserts one row into ``metadata.books_languages_link`` per entry of
    ``lang_ids``; ``item_order`` is the 1-based position of the entry.
    When the module-level ``database`` handle is falsy, a message is printed
    and the interpreter exits.
    """
    if not database:
        print("No connection to DB")
        exit()
    cursor = database.cursor()
    for position, lang_code in enumerate(lang_ids, 1):
        cursor.execute('INSERT INTO metadata.books_languages_link(book,lang_code,item_order) VALUES (%s,%s,%s)', (book_id,lang_code,position))
def PathByID(book_id):
    """Return the ``path`` column of ``metadata.books`` for ``book_id``.

    When the module-level ``database`` handle is falsy, a message is printed
    and the interpreter exits.

    Raises:
        TypeError: if no row matches, because ``fetchone()`` then returns
            ``None`` and cannot be indexed.
    """
    if not database:
        print("No connection to DB")
        exit()
    c = database.cursor()
    # DB-API parameters must be a sequence; ``(book_id)`` is just ``book_id``,
    # so the trailing comma is required to build a one-element tuple.
    c.execute('SELECT path FROM metadata.books WHERE id=%s',(book_id,))
    return c.fetchone()[0]
# Module initialisation, executed at import time:
#   1. read connection settings and storage locations from /etc/openlib.conf;
#   2. open the shared MySQL connection used by every function in this module.
# On failure a message is printed and the interpreter exits with a non-zero
# status so that callers can tell the run did not succeed.

try:
    cfg = ConfigParser.RawConfigParser(allow_no_value=True)
    # ``with`` closes the config file even if parsing fails
    with open('/etc/openlib.conf') as conf_file:
        cfg.readfp(conf_file)
    dbhost = cfg.get("mysql","host")
    dbuser = cfg.get("mysql","user")
    dbpasswd = cfg.get("mysql","passwd")
    file_root = cfg.get("storage","files")
    tmp_files = cfg.get("storage","temp")
    failed_files = cfg.get("storage","failed")
    upload_files = cfg.get("storage","upload")
except (IOError, ConfigParser.Error):
    # IOError: file missing/unreadable; ConfigParser.Error: malformed file,
    # missing section or missing option
    print("Error reading configuration file")
    exit(1)

try:
    database = MySQLdb.connect(host=dbhost,user=dbuser,passwd=dbpasswd,use_unicode=True)
    database.set_character_set('utf8')
    c = database.cursor()
    c.execute('SET NAMES utf8;')
except MySQLdb.Error:
    print("Error connecting database")
    exit(1)
def strip_encoding_declarations(raw):
    """Remove encoding declarations from the beginning of ``raw``.

    Every pattern in ``ENCODING_PATS`` is applied in turn, but only to the
    first 50 KiB of the current text; the remainder is appended unchanged.
    Returns the resulting text.
    """
    head_size = 50*1024
    for declaration in ENCODING_PATS:
        # Re-slice on every pass: a previous substitution may have shortened
        # the text, which shifts what counts as the first 50 KiB.
        raw = declaration.sub('', raw[:head_size]) + raw[head_size:]
    return raw
Tries to look for XML/HTML + encoding declaration first, if not found uses the chardet library and + prints a warning if detection confidence is < 100% + @return: (unicode, encoding used) + ''' + encoding = None + if not raw: + return u'', encoding + if not isinstance(raw, unicode): + if raw.startswith(codecs.BOM_UTF8): + raw, encoding = raw.decode('utf-8')[1:], 'utf-8' + elif raw.startswith(codecs.BOM_UTF16_LE): + raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le' + elif raw.startswith(codecs.BOM_UTF16_BE): + raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be' + if not isinstance(raw, unicode): + for pat in ENCODING_PATS: + match = pat.search(raw) + if match: + encoding = match.group(1) + break + if encoding is None: + encoding = 'utf-8' + + try: + if encoding.lower().strip() == 'macintosh': + encoding = 'mac-roman' + if encoding.lower().replace('_', '-').strip() in ( + 'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn', + 'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'): + # Microsoft Word exports to HTML with encoding incorrectly set to + # gb2312 instead of gbk. gbk is a superset of gb2312, anyway. 
def get_metadata(stream):
    """Read FB2 metadata from ``stream`` into a plain attribute container.

    ``stream`` must provide ``read()``; its ``name`` attribute, when present,
    is used as the title if the document has none.

    Returns an object whose attributes are set as follows:

    * nothing at all, when the document root could not be parsed;
    * ``book_title`` and ``authors`` otherwise;
    * whatever the optional ``_parse_*`` helpers add (comments, tags, series,
      ISBN, publisher, publication date, language, cover). A helper that
      fails is skipped, so these attributes may be absent.
    """
    mi = type('lamdbaobject', (object,), {})()

    root = _get_fbroot(stream)
    if root is None:
        return mi

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)

    if book_title:
        book_title = unicode(book_title)
    else:
        # Fall back to the stream's name. Not every stream has one (e.g. an
        # in-memory buffer), so supply a default instead of raising.
        book_title = force_unicode(getattr(stream, 'name', 'Unknown'))
    mi.book_title = book_title
    mi.authors = authors

    optional_parsers = (
        _parse_comments,
        _parse_tags,
        _parse_series,
        _parse_isbn,
        _parse_publisher,
        _parse_pubdate,
        _parse_language,
        lambda fb_root, info: _parse_cover_data(fb_root, 'cover.jpg', info),
    )
    for parse in optional_parsers:
        try:
            parse(root, mi)
        except Exception:
            # Optional metadata is best effort: a malformed field must not
            # prevent the rest from being read. KeyboardInterrupt and
            # SystemExit are deliberately not caught.
            pass

    return mi
def _parse_author(elm_author):
    """Build a display name from one FB2 author element.

    Returns a single string: the non-empty values of first-name, middle-name
    and last-name joined by single spaces. If all three are empty the
    nickname is returned instead, and an empty string if that is empty too.
    """
    def field(tag):
        # whitespace-normalised text of the given child element
        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)

    first = field('first-name')
    last = field('last-name')
    middle = field('middle-name')

    author = first
    for part in (middle, last):
        if part:
            author = (author + ' ' + part).strip()

    if author:
        return author

    # fallback to nickname
    nickname = field('nickname')
    return nickname if nickname else author
def _parse_isbn(root, mi):
    """Set ``mi.isbn`` from the publish-info ISBN, if the document has one.

    The field is meant to hold a single ISBN; when several are given
    separated by commas, only the text before the first comma is kept.
    ``mi`` is left untouched when no ISBN is found.
    """
    found = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
    if not found:
        return
    if ',' in found:
        found = found.split(',', 1)[0]

    #TODO add from calibre: validate with check_isbn before storing
    mi.isbn = found
def _parse_uuid(root, mi):
    """Set ``mi.uuid`` from the document-info id, if the document has one.

    ``mi`` is left untouched when the id is missing or empty.
    """
    # Every step needs the ``fb2:`` prefix: an unprefixed ``document-info``
    # only matches elements in no namespace, so the id was never found in
    # documents that use the FictionBook namespace.
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
def GetPath(self):
    """Compute where this book is stored and what its data file is called.

    Reads ``self.title``, ``self.author`` and ``self.book_id`` and sets:

    * ``self.sort_title`` / ``self.sort_author`` - the database sort keys
      with spaces replaced by underscores;
    * ``self.path`` - nested directories built from growing prefixes of the
      author sort key, ending in "<sort title> (<book id>)";
    * ``self.dataname`` - truncated title and author with slashes removed.
    """
    sort_title = db.SortName(self.title).replace(' ','_')
    sort_author = db.SortAuthorName(self.author).replace(' ','_')
    self.sort_title = sort_title
    self.sort_author = sort_author

    path_parts = (
        sort_author[0],
        sort_author[0:2],
        sort_author[0:4],
        sort_author[0:32],
        sort_title[0:64],
        self.book_id,
    )
    self.path = "%s/%s/%s/%s/%s (%d)" % path_parts

    # Truncate after decoding so that characters, not bytes, are counted.
    short_title = self.title.decode('utf-8')[0:64]
    short_author = self.author.decode('utf-8')[0:32]
    self.dataname = (short_title+' '+short_author).replace('/','')
def _move_to_failed(filename):
    """Roll back pending DB changes and park ``filename`` in db.failed_files."""
    # Roll back first so a problem while moving cannot leave the
    # transaction open.
    db.Rollback()
    if os.path.exists(filename):
        shutil.move(filename,os.path.join(db.failed_files,os.path.basename(filename)))
        print("Moved to failed_files")
    else:
        # The failure happened after the file had already been moved away.
        print("Source file is gone, nothing to move to failed_files")


def ProcessFile(filename):
    """Import one .fb2 file into the library.

    Reads the file's metadata, registers the book through ``MetaData`` and
    then, depending on the resulting ``state``:

    * "done"  - moves the file into its library directory, stores the cover
      (if any) next to it and commits;
    * "trash" - deletes the file;
    * anything else, or any error - rolls back and moves the file to
      ``db.failed_files``.
    """
    size = os.path.getsize(filename)
    stream = open(filename)
    try:
        meta = fb2_meta.get_metadata(stream)
    finally:
        # do not leak the handle if parsing raises
        stream.close()

    try:
        book = MetaData(meta.__dict__,size)

        if book.state=="done":

            # os.path.join, like DelBook/CompressBook, so the result does
            # not depend on a trailing slash in the configured root
            new_dir_path = os.path.join(db.file_root,book.path)
            new_file_path = os.path.join(new_dir_path,book.dataname + '.fb2')

            os.makedirs(new_dir_path,0o755)
            shutil.move(filename,new_file_path)

            if book.has_cover:
                cover_path = os.path.join(new_dir_path,'cover.jpg')
                print("Book has cover, try to store to "+cover_path)
                # image data is binary: text mode could alter it
                coverfile = open(cover_path,'wb')
                try:
                    coverfile.write(book.cover.decode('base64'))
                finally:
                    coverfile.close()

            db.Commit()
            print("Moved to "+new_dir_path)

        elif book.state=="trash":

            print("Tags blacklisted, trashing")
            os.remove(filename)

        else:

            _move_to_failed(filename)

    except Exception:
        # Any failure while importing must not stop the whole batch.
        _move_to_failed(filename)
def CompressBook(id):
    """Replace a book's stored FB2 file with a zip archive of it.

    Runs ``zip --move --junk-paths <file>.zip <file>`` and, when zip exits
    with status 0, changes the book's format from 'FB2' to 'FB2.ZIP' in the
    database and commits. Does nothing when the book has no path.
    """
    import subprocess

    path=db.PathByID(id)
    if path:
        datafile = os.path.join(db.file_root,path,db.DataByID(id,'FB2'))
        zipfile = datafile + '.zip'
        # An argument list is passed straight to zip without a shell, so
        # quotes, backticks, dollars or backslashes in file names need no
        # escaping and cannot be interpreted as shell syntax.
        command = ['zip','--move','--junk-paths',zipfile,datafile]
        # On Python 2 the paths may be unicode objects; hand UTF-8 bytes to
        # the OS, as the original command string did.
        command = [arg if isinstance(arg, str) else arg.encode('utf-8') for arg in command]
        print(' '.join(command))
        if subprocess.call(command)==0:
            db.ChangeBookFormat(id,'FB2','FB2.ZIP')
            db.Commit()
#!/usr/bin/python
"""Download new daily archives from the first mirror that lists any.

Each host's /daily/ index page is scanned for links matching ``pattern``.
Archives not yet recorded in the database are fetched with wget into
``db.upload_files`` and recorded once wget succeeds. The remaining hosts are
skipped as soon as one host yielded at least one matching link.
"""

from BeautifulSoup import BeautifulSoup
import urllib
import db
import re
import os
import subprocess

# Presumably archive names start with the literal text "f.fb2"; the dots are
# escaped so that they no longer match arbitrary characters.
pattern = re.compile(r"^f\.fb2.*")
os.environ['http_proxy']='http://localhost:3128'
os.environ['no_proxy']='localhost,127.0.0.1'
proxies = {'http': 'http://localhost:3128'}

matched = False

for host in ['flibusta.net','flibustahezeous3.onion','flibusta.i2p']:

    matched = False

    try:
        print("Trying %s" % (host))
        html_page = urllib.urlopen("http://%s/daily/" % (host))
        html = BeautifulSoup(html_page)

        for link in html.findAll('a'):
            file = link.get("href")
            if not file:
                # anchors without href would make pattern.match() raise and
                # abort the whole host
                continue
            print(file)
            if pattern.match(file):
                matched = True
                if not db.TestArchive(file):
                    print("Processing %s" % file)
                    url = "http://%s/daily/%s" % (host,file)
                    # The href comes from a remote page: pass it to wget as a
                    # single argument, never through a shell.
                    if subprocess.call(['wget','-c','-P',db.upload_files,url]) == 0:
                        db.MarkArchive(file)
                        db.Commit()
    except Exception:
        # unreachable host, unparsable page, ...: try the next mirror
        matched = False

    if matched:
        break

if matched:
    print("Got from %s" % host)
else:
    print("Failed to get")