+from __future__ import with_statement
+__license__ = 'GPL v3'
+__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
+ '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
+'''Read meta information from fb2 files'''
+# TODO clean up and save only needed (sorry for this code:) )
+# -------------------------------------------
+#From calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
+# /src/calibre/ebooks/metadata/fb2.py
+#Based on revision 10897
+import os
+import datetime
+from functools import partial
+# from base64 import b64decode
+from lxml import etree
+#from calibre.utils.date import parse_date
+#from calibre import guess_type, guess_all_extensions, prints, force_unicode
+#from calibre.ebooks.metadata import MetaInformation, check_isbn
+#from calibre.ebooks.chardet import xml_to_unicode
+# -------------------------------------------
def force_unicode(text):
    """Return *text* as a unicode string.

    Unicode input is returned unchanged. Anything else is handed to the
    unicode constructor and decoded as UTF-8.

    The text type is obtained with ``type(u'')`` rather than the ``unicode``
    builtin, so the helper also works on interpreters where that builtin
    name does not exist.
    """
    text_type = type(u'')
    if isinstance(text, text_type):
        return text
    return text_type(text, encoding='utf-8')
+# -------------------------------------------
+# from calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
+# /src/calibre/ebooks/chardet/__init__.py
+# Based on rev 10897
+import re, codecs
# Patterns that locate an encoding name inside an XML declaration or an HTML
# <meta ... content="...charset=..."> tag; group(1) is the declared encoding.
# NOTE(review): the assignment line and the flags argument of each
# re.compile() call were absent from this fragment. re.IGNORECASE is assumed
# (the charset class only lists lower-case letters) -- confirm against the
# upstream calibre chardet module.
ENCODING_PATS = [
    re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
               re.IGNORECASE),
    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
               re.IGNORECASE),
]
def strip_encoding_declarations(raw):
    """Remove encoding declarations from the beginning of *raw*.

    Every pattern in ENCODING_PATS is applied in turn, but only to the
    first ``50*1024`` characters; the rest of the text is appended
    untouched.
    """
    limit = 50*1024
    for pat in ENCODING_PATS:
        # Slice again on every pass: a previous substitution may have
        # shortened the text, which shifts the cut-off point.
        head, tail = raw[:limit], raw[limit:]
        raw = pat.sub('', head) + tail
    return raw
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    """Convert a byte string to unicode and report the encoding that was used.

    The encoding is chosen as follows:

    1. A leading UTF-8 / UTF-16 byte order mark wins; the data is decoded
       with that codec and the mark is dropped.
    2. Otherwise the first match of ENCODING_PATS (an XML or HTML encoding
       declaration) names the encoding.
    3. Without a declaration, or when the declared codec is unknown to
       Python, UTF-8 is used.

    In cases 2 and 3 undecodable bytes are replaced instead of raising.
    Input that is already unicode is not decoded and is reported with
    encoding ``None``; empty input yields ``(u'', None)``.

    When ``strip_encoding_pats`` is true the encoding declarations are
    removed from the returned text. ``verbose``, ``resolve_entities`` and
    ``assume_utf8`` are accepted but not used by this implementation.

    @return: (unicode, encoding used)
    """
    if not raw:
        return u'', None

    encoding = None
    if not isinstance(raw, unicode):
        boms = (
            (codecs.BOM_UTF8, 'utf-8'),
            (codecs.BOM_UTF16_LE, 'utf-16-le'),
            (codecs.BOM_UTF16_BE, 'utf-16-be'),
        )
        for bom, codec in boms:
            if raw.startswith(bom):
                # Once decoded the mark is a single character: drop it.
                raw, encoding = raw.decode(codec)[1:], codec
                break

    # Still a byte string: no byte order mark was present.
    if not isinstance(raw, unicode):
        for pat in ENCODING_PATS:
            found = pat.search(raw)
            if found:
                encoding = found.group(1)
                break
        if encoding is None:
            encoding = 'utf-8'
        gb2312_aliases = (
            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58')
        try:
            if encoding.lower().strip() == 'macintosh':
                encoding = 'mac-roman'
            if encoding.lower().replace('_', '-').strip() in gb2312_aliases:
                # Microsoft Word exports HTML labelled gb2312 although the
                # content is gbk; gbk is a superset of gb2312 anyway.
                encoding = 'gbk'
            raw = raw.decode(encoding, 'replace')
        except LookupError:
            # The declaration named a codec Python does not know.
            encoding = 'utf-8'
            raw = raw.decode(encoding, 'replace')

    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    return raw, encoding
+# -------------------------------------------
# Namespace prefixes usable in every XPath expression of this module.
NAMESPACES = {
    'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
    'xlink': 'http://www.w3.org/1999/xlink',
}
# etree.XPath with the namespace map above already bound.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# etree.tostring preset to text output (method='text') returned as unicode.
tostring = partial(etree.tostring, method='text', encoding=unicode)
def get_metadata(stream):
    """Read fb2 metadata from *stream* and return it as a plain object.

    The returned object is an instance of an ad-hoc empty class; metadata
    is attached to it as attributes:

    - ``book_title`` and ``authors`` are always set once a root element
      was obtained. The title falls back to the stream's ``name``, or to
      ``'Unknown'`` when the stream has no such attribute.
    - every other attribute is set by a ``_parse_*`` helper and is
      optional: it is simply absent when the document lacks the data or
      when the helper fails.

    If no root element is obtained the object is returned without any
    attribute.
    """
    import logging  # local import: only needed to report skipped fields

    mi = type('lamdbaobject', (object,), {})()
    root = _get_fbroot(stream)
    if root is None:
        return mi

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)
    if book_title:
        book_title = unicode(book_title)
    else:
        # Not every stream has a ``name`` (in-memory buffers do not), so a
        # default is needed to keep this fallback from raising.
        book_title = force_unicode(getattr(stream, 'name', 'Unknown'))
    mi.book_title = book_title
    mi.authors = authors

    # Best-effort fields: a failure in one helper must not prevent the
    # others from running. _parse_timestamp and _parse_uuid are
    # intentionally not part of this list.
    optional_steps = (
        (_parse_comments, (root, mi)),
        (_parse_tags, (root, mi)),
        (_parse_series, (root, mi)),
        (_parse_isbn, (root, mi)),
        (_parse_publisher, (root, mi)),
        (_parse_pubdate, (root, mi)),
        (_parse_language, (root, mi)),
        (_parse_cover_data, (root, 'cover.jpg', mi)),
    )
    log = logging.getLogger(__name__)
    for step, args in optional_steps:
        try:
            step(*args)
        except Exception:
            # Catch Exception only, so KeyboardInterrupt and SystemExit
            # still propagate; keep a trace of what was skipped.
            log.debug('fb2 metadata: %s failed, field skipped',
                      step.__name__, exc_info=True)
    return mi
def _parse_authors(root):
    """Return the list of author names found in the document.

    Authors are taken from one section only, because mixing sections gives
    inconsistent results: <title-info> is tried first and <src-title-info>
    is used only when <title-info> yields no author. All non-empty authors
    of the chosen section are returned. When no author is found at all the
    result is ``['Unknown']``.
    """
    authors = []
    for author_sec in ('title-info', 'src-title-info'):
        for au in XPath('//fb2:%s/fb2:author' % author_sec)(root):
            author = _parse_author(au)
            if author:
                authors.append(author)
        # Stop at the first section that produced at least one author.
        if authors:
            break
    if not authors:
        authors.append('Unknown')
    return authors
def _parse_author(elm_author):
    """Return the display name for one <author> element as a string.

    The name is built from the first, middle and last name, joined with
    single spaces and skipping the parts that are empty. When all three
    are empty the nickname is returned instead; if that is empty too the
    result is an empty string.
    """
    def field(tag):
        # Whitespace-normalised text of the given child element.
        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)

    author = field('first-name')
    last = field('last-name')
    middle = field('middle-name')
    for part in (middle, last):
        if part:
            author = (author + ' ' + part).strip()
    if author:
        return author
    # fallback to nickname
    nick = field('nickname')
    return nick if nick else author
def _parse_book_title(root):
    """Return the whitespace-normalised book title, or an empty string.

    <title-info> has priority (it is mandatory in fb2); <publish-info> and
    <src-title-info> are fallbacks, tried in that order. The order is
    deliberate and differs from the order of the sections in the document.

    Each section is queried separately: a single XPath union would return
    its nodes in document order and therefore ignore this priority.
    """
    book_title = ''
    for section in ('title-info', 'publish-info', 'src-title-info'):
        xp = 'normalize-space(//fb2:%s/fb2:book-title/text())' % section
        book_title = XPath(xp)(root)
        if book_title:
            break
    return book_title
+#TODO add from calibre
def _parse_cover_data(root, imgid, mi):
    """Store the content of the <binary> element with id *imgid* in mi.cover.

    The element's text is stored as found; it is not decoded here. When no
    matching element exists ``mi`` is left untouched.
    """
    elm_binary = XPath('//fb2:binary[@id="%s"]' % imgid)(root)
    if elm_binary:
        mi.cover = elm_binary[0].text
def _parse_tags(root, mi):
    """Set mi.tags to the genres of the document.

    Genres come from one section only: <title-info>, or <src-title-info>
    when <title-info> has none. ``mi.tags`` is not set when neither
    section contains a genre.
    """
    for section in ('title-info', 'src-title-info'):
        # -- i18n Translations-- ?
        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
        if not genres:
            continue
        mi.tags = [unicode(genre) for genre in genres]
        return
def _parse_series(root, mi):
    """Set mi.series (and mi.series_index) from the first <sequence> found.

    Only one series is kept. The first <sequence> of <title-info> and of
    <publish-info> are candidates and the first node the query returns is
    used; <src-title-info> is not consulted. ``mi.series_index`` is only
    set when the sequence has a non-empty ``name``.
    """
    #TODO parse all
    query = '|'.join(('//fb2:title-info/fb2:sequence[1]',
                      '//fb2:publish-info/fb2:sequence[1]'))
    found = XPath(query)(root)
    if not found:
        return
    first = found[0]
    mi.series = first.get('name', None)
    if mi.series:
        mi.series_index = first.get('number', None)
def _parse_isbn(root, mi):
    """Set mi.isbn from <publish-info>/<isbn>; nothing is set when empty.

    The field is meant to hold a single ISBN. Some files list several,
    separated by commas; in that case only the part before the first comma
    is kept. The value is not validated.
    """
    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
    if not isbn:
        return
    if ',' in isbn:
        isbn = isbn.split(',', 1)[0]
    #TODO add from calibre
    #if check_isbn(isbn):
    mi.isbn = isbn
def _parse_comments(root, mi):
    """Set mi.comments to the text of the book annotation.

    The first <annotation> of <title-info> is used, or the one of
    <src-title-info> when <title-info> has none. ``mi.comments`` is not
    set when neither section has an annotation.
    """
    for section in ('title-info', 'src-title-info'):
        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
        if annotations:
            # TODO: tags i18n, xslt?
            mi.comments = tostring(annotations[0])
            return
def _parse_publisher(root, mi):
    """Set mi.publisher from <publish-info>/<publisher> when it is not empty."""
    name = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if not name:
        return
    mi.publisher = name
def _parse_pubdate(root, mi):
    """Set mi.pubdate from <publish-info>/<year>.

    Only the year is available, so the date is the 1st of January of that
    year. Nothing is set when the year is missing, is not a whole number,
    or lies outside the range ``datetime.date`` supports.
    """
    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    # is_integer() is False for NaN, infinities and fractional values.
    if not float.is_integer(year):
        return
    year = int(year)
    # datetime.date raises ValueError outside MINYEAR..MAXYEAR.
    if datetime.MINYEAR <= year <= datetime.MAXYEAR:
        mi.pubdate = datetime.date(year, 1, 1)
def _parse_timestamp(root, mi):
    """Set mi.timestamp to the document date, kept as an unparsed string.

    Example element: <date value="1996-12-03">03.12.1996</date>. Both the
    ``value`` attribute and the element text are queried; the string value
    of the first node the query returns is stored. Nothing is set when
    that string is empty.
    """
    candidates = ('//fb2:document-info/fb2:date/@value',
                  '//fb2:document-info/fb2:date/text()')
    docdate = XPath('string(%s)' % '|'.join(candidates))(root)
    if not docdate:
        return
    #TODO add from calibre
    #mi.timestamp = parse_date(docdate)
    mi.timestamp = docdate
def _parse_language(root, mi):
    """Set mi.language and mi.languages from <title-info>/<lang>.

    ``mi.languages`` is a one-element list holding the same value. Neither
    attribute is set when the element is missing or empty.
    """
    lang = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if not lang:
        return
    mi.language = lang
    mi.languages = [lang]
def _parse_uuid(root, mi):
    """Set mi.uuid from <document-info>/<id> when it is not empty."""
    # Both steps need the fb2: prefix: the elements live in the fb2
    # namespace, and an unprefixed name only matches un-namespaced elements.
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
def _get_fbroot(stream):
    """Read *stream* completely and return the parsed root element.

    The content is converted to unicode with its encoding declarations
    stripped, then parsed with a recovering parser that has network
    access disabled.
    """
    parser = etree.XMLParser(recover=True, no_network=True)
    decoded = xml_to_unicode(stream.read(), strip_encoding_pats=True)
    # decoded is (text, encoding); only the text is needed here.
    return etree.fromstring(decoded[0], parser=parser)