from __future__ import with_statement
__license__   = 'GPL v3'
__copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
                '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
'''Read meta information from fb2 files'''


# TODO clean up and save only needed (sorry for this code:) )

# -------------------------------------------

#From calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
#             /src/calibre/ebooks/metadata/fb2.py
#Based on revision 10897

import os
import datetime
from functools import partial
# from base64 import b64decode
from lxml import etree
#from calibre.utils.date import parse_date
#from calibre import guess_type, guess_all_extensions, prints, force_unicode
#from calibre.ebooks.metadata import MetaInformation, check_isbn
#from calibre.ebooks.chardet import xml_to_unicode


# -------------------------------------------

def force_unicode(text):
    """Return *text* as a unicode string, decoding byte strings as UTF-8."""
    if isinstance(text, unicode):
        return text
    return unicode(text, encoding='utf-8')

# -------------------------------------------
# from calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
#              /src/calibre/ebooks/chardet/__init__.py
# Based on rev 10897

import re, codecs

# Regexes capturing a declared character encoding (group 1) in either an
# XML prolog or an HTML <meta> content/charset attribute.
ENCODING_PATS = [
                 re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
                            re.IGNORECASE),
                 re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
                            re.IGNORECASE),
                 ]

def strip_encoding_declarations(raw):
    """Remove encoding declarations found in the first 50 KB of *raw*.

    Only the head of the document is scanned; the tail is passed through
    untouched on every pass.
    """
    head_size = 50 * 1024
    for pattern in ENCODING_PATS:
        head, tail = raw[:head_size], raw[head_size:]
        raw = pattern.sub('', head) + tail
    return raw

def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of a byte string to unicode.  Detection order: a
    Unicode BOM first, then an XML/HTML encoding declaration in the
    content; if neither is found, UTF-8 is assumed.  (This trimmed copy of
    the calibre helper does not fall back to chardet; the ``verbose``,
    ``resolve_entities`` and ``assume_utf8`` arguments are accepted for
    API compatibility but are ignored here.)
    @return: (unicode, encoding used)
    '''
    encoding = None
    if not raw:
        return u'', encoding
    # A BOM is the most reliable signal: decode accordingly and drop the
    # leading BOM character from the decoded text.
    if not isinstance(raw, unicode):
        if raw.startswith(codecs.BOM_UTF8):
            raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
        elif raw.startswith(codecs.BOM_UTF16_LE):
            raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
        elif raw.startswith(codecs.BOM_UTF16_BE):
            raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
    # Still a byte string: look for an explicit declaration in the content.
    if not isinstance(raw, unicode):
        for pat in ENCODING_PATS:
            match = pat.search(raw)
            if match:
                encoding = match.group(1)
                break
        if encoding is None:
            encoding = 'utf-8'

        try:
            # 'macintosh' is not a codec name Python recognises; remap it.
            if encoding.lower().strip() == 'macintosh':
                encoding = 'mac-roman'
            if encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                encoding = 'gbk'
            raw = raw.decode(encoding, 'replace')
        except LookupError:
            # The document declared a codec Python does not know: fall back
            # to UTF-8 with replacement characters rather than failing.
            encoding = 'utf-8'
            raw = raw.decode(encoding, 'replace')

    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    #if resolve_entities:
    #    raw = substitute_entites(raw)

    return raw, encoding


# -------------------------------------------

# XML namespaces used by FictionBook 2.0 documents.
NAMESPACES = {
    'fb2'   :   'http://www.gribuser.ru/xml/fictionbook/2.0',
    'xlink' :   'http://www.w3.org/1999/xlink'  }

# XPath expression factory pre-bound to the FB2 namespaces above.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# Serialize an element subtree to its plain-text content (tags stripped).
tostring = partial(etree.tostring, method='text', encoding=unicode)

def get_metadata(stream):
    '''Return fb2 metadata read from *stream* as an anonymous object with
    attributes (book_title, authors, and whichever of comments/tags/series/
    isbn/publisher/pubdate/language/cover the document provides).

    Extraction is best-effort: a malformed metadata section never prevents
    the remaining sections from being parsed.
    '''

    # Anonymous attribute holder standing in for calibre's MetaInformation.
    mi = type('lamdbaobject', (object,), {})()

    root = _get_fbroot(stream)
    if root is None:
        return mi

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)

    # Fallback for book_title: use the stream's name when the document
    # carries no title.  getattr() needs a default here -- not every
    # file-like object has a .name attribute.
    if book_title:
        book_title = unicode(book_title)
    else:
        book_title = force_unicode(getattr(stream, 'name', 'Unknown'))
    mi.book_title = book_title
    mi.authors = authors

    # Each parser is independent and optional; swallow per-section errors
    # so one bad section cannot abort the whole extraction.
    for parse in (_parse_comments, _parse_tags, _parse_series, _parse_isbn,
                  _parse_publisher, _parse_pubdate, _parse_language):
        try:
            parse(root, mi)
        except Exception:
            pass

    try:
        _parse_cover_data(root, 'cover.jpg', mi)
    except Exception:
        pass

    return mi

def _parse_authors(root):
    """Return the list of author display names.

    Authors are taken from a single section only -- <title-info> preferred,
    <src-title-info> as fallback -- so the result stays consistent.
    Returns ['Unknown'] when no author can be found.
    """
    authors = []
    for author_sec in ['title-info', 'src-title-info']:
        for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
            author = _parse_author(au)
            if author:
                authors.append(author)
        # Stop at the first section that yielded any author.  (The original
        # code broke out of the inner loop instead, which both truncated
        # the list to one author per section and still fell through to the
        # fallback section.)
        if authors:
            break

    # if no author so far
    if not authors:
        authors.append('Unknown')

    return authors

def _parse_author(elm_author):
    """Return the author's display name built from first/middle/last name
    parts, falling back to the nickname when no name parts are present."""

    def part(tag):
        return XPath('normalize-space(fb2:%s/text())' % tag)(elm_author)

    pieces = [part('first-name'), part('middle-name'), part('last-name')]
    author = ' '.join(p for p in pieces if p)

    # fallback to nickname
    if not author:
        author = part('nickname') or author

    return author


def _parse_book_title(root):
    """Return the normalized book title, or '' when none is present.

    <title-info> is mandatory per the FB2 spec and effectively wins;
    <publish-info> and <src-title-info> act as fallbacks (normalize-space()
    evaluates the first node of the union in document order).
    """
    paths = ('//fb2:title-info/fb2:book-title/text()',
             '//fb2:publish-info/fb2:book-title/text()',
             '//fb2:src-title-info/fb2:book-title/text()')
    return XPath('normalize-space(%s)' % '|'.join(paths))(root)

#TODO add from calibre
def _parse_cover_data(root, imgid, mi):
    """Store the text of <binary id=imgid> on mi.cover.

    NOTE(review): the text appears to be the raw base64 payload, stored
    without decoding; the content-type attribute is read but unused.
    """
    matches = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
    if not matches:
        return
    mimetype = matches[0].get('content-type', 'image/jpeg')
    mi.cover = matches[0].text

def _parse_tags(root, mi):
    """Set mi.tags from <genre> elements of a single section.

    Genres are picked from <title-info> when present, otherwise from
    <src-title-info>, so the tag list stays consistent.
    """
    for section in ('title-info', 'src-title-info'):
        genres = XPath('//fb2:%s/fb2:genre/text()' % section)(root)
        if genres:
            mi.tags = [unicode(g) for g in genres]
            break

def _parse_series(root, mi):
    """Set mi.series / mi.series_index from the first <sequence> found.

    calibre supports only one series, so the first <sequence> wins,
    preferring <title-info> over <publish-info>; <src-title-info> is
    deliberately ignored.  (TODO: parse all sequences.)
    """
    elms = XPath('//fb2:title-info/fb2:sequence[1]'
                 '|//fb2:publish-info/fb2:sequence[1]')(root)
    if not elms:
        return
    mi.series = elms[0].get('name', None)
    if mi.series:
        mi.series_index = elms[0].get('number', None)

def _parse_isbn(root, mi):
    """Set mi.isbn from <publish-info>/<isbn>, if present."""
    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
    if not isbn:
        return
    # Some books cram several comma-separated ISBNs into this field even
    # though that is not allowed; keep only the first one.
    isbn = isbn.split(',', 1)[0]

    #TODO add from calibre
    #if check_isbn(isbn):
    mi.isbn = isbn

def _parse_comments(root, mi):
    """Set mi.comments from the <annotation> of a single section:
    <title-info> when present, otherwise <src-title-info>."""
    for section in ('title-info', 'src-title-info'):
        annotations = XPath('//fb2:%s/fb2:annotation' % section)(root)
        if annotations:
            # TODO: tags i18n, xslt?
            mi.comments = tostring(annotations[0])
            break

def _parse_publisher(root, mi):
    """Set mi.publisher from <publish-info>/<publisher>, if present."""
    name = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if name:
        mi.publisher = name

def _parse_pubdate(root, mi):
    """Set mi.pubdate (January 1st of the publication year), if available."""
    # XPath number() yields NaN for a missing or non-numeric <year>;
    # NaN.is_integer() is False, so mi.pubdate stays unset in that case.
    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    if year.is_integer():
        # only the year is available, so default to the 1st of January
        mi.pubdate = datetime.date(int(year), 1, 1)

def _parse_timestamp(root, mi):
    """Set mi.timestamp from <document-info>/<date>, if present.

    Example element: <date value="1996-12-03">03.12.1996</date>.
    The @value attribute is preferred over the element text (string()
    takes the first node of the union in document order).
    """
    expr = ('//fb2:document-info/fb2:date/@value'
            '|//fb2:document-info/fb2:date/text()')
    docdate = XPath('string(%s)' % expr)(root)
    if docdate:
        # Stored as the raw string; calibre's parse_date is unavailable here.
        #TODO add from calibre
        #mi.timestamp = parse_date(docdate)
        mi.timestamp = docdate

def _parse_language(root, mi):
    """Set mi.language / mi.languages from <title-info>/<lang>, if present."""
    lang = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if lang:
        mi.language = lang
        mi.languages = [lang]

def _parse_uuid(root, mi):
    """Set mi.uuid from <document-info>/<id>, if present.

    Fix: the original expression used an unprefixed '//document-info',
    which can never match because every FB2 element lives in the fb2
    namespace (see NAMESPACES); the step needs the fb2: prefix.
    """
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid

def _get_fbroot(stream):
    """Read all of *stream* and return the parsed FB2 root element.

    The raw bytes are decoded to unicode first, with encoding declarations
    stripped, before being handed to a recovering lxml parser.
    """
    raw = stream.read()
    text = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    parser = etree.XMLParser(recover=True, no_network=True)
    return etree.fromstring(text, parser=parser)
