fb2_meta.py

   1 from __future__ import with_statement
   2 __license__   = 'GPL v3'
   3 __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
   4                 '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
   5 '''Read meta information from fb2 files'''
   6
   7
   8 # TODO: clean up and keep only what is needed (sorry for this code :) )
   9
  10 # -------------------------------------------
  11
  12 #From calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
  13 #             /src/calibre/ebooks/metadata/fb2.py
  14 #Based on revision 10897
  15
  16 import os
  17 import datetime
  18 from functools import partial
  19 # from base64 import b64decode
  20 from lxml import etree
  21 #from calibre.utils.date import parse_date
  22 #from calibre import guess_type, guess_all_extensions, prints, force_unicode
  23 #from calibre.ebooks.metadata import MetaInformation, check_isbn
  24 #from calibre.ebooks.chardet import xml_to_unicode
  25
  26
  27 # -------------------------------------------
  28
  29 def force_unicode(text):
  30     if not isinstance(text, unicode):
  31         uni = unicode(text, encoding='utf-8')
  32     else:
  33         uni = text
  34     return uni
  35
  36 # -------------------------------------------
  37 # from calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
  38 #              /src/calibre/ebooks/chardet/__init__.py
  39 # Based on rev 10897
  40
  41 import re, codecs
  42 ENCODING_PATS = [
  43                  re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
  44                             re.IGNORECASE),
  45                  re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
  46                             re.IGNORECASE),
  47                  ]
  48
  49 def strip_encoding_declarations(raw):
  50     limit = 50*1024
  51     for pat in ENCODING_PATS:
  52         prefix = raw[:limit]
  53         suffix = raw[limit:]
  54         prefix = pat.sub('', prefix)
  55         raw = prefix + suffix
  56     return raw
  57
  58 def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
  59                    resolve_entities=False, assume_utf8=False):
  60     '''
  61     Force conversion of byte string to unicode. Tries to look for XML/HTML
  62     encoding declaration first, if not found uses the chardet library and
  63     prints a warning if detection confidence is < 100%
  64     @return: (unicode, encoding used)
  65     '''
  66     encoding = None
  67     if not raw:
  68         return u'', encoding
  69     if not isinstance(raw, unicode):
  70         if raw.startswith(codecs.BOM_UTF8):
  71             raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
  72         elif raw.startswith(codecs.BOM_UTF16_LE):
  73             raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
  74         elif raw.startswith(codecs.BOM_UTF16_BE):
  75             raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'
  76     if not isinstance(raw, unicode):
  77         for pat in ENCODING_PATS:
  78             match = pat.search(raw)
  79             if match:
  80                 encoding = match.group(1)
  81                 break
  82         if encoding is None:
  83             encoding = 'utf-8'
  84
  85         try:
  86             if encoding.lower().strip() == 'macintosh':
  87                 encoding = 'mac-roman'
  88             if encoding.lower().replace('_', '-').strip() in (
  89                     'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
  90                     'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
  91                 # Microsoft Word exports to HTML with encoding incorrectly set to
  92                 # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
  93                 encoding = 'gbk'
  94             raw = raw.decode(encoding, 'replace')
  95         except LookupError:
  96             encoding = 'utf-8'
  97             raw = raw.decode(encoding, 'replace')
  98
  99     if strip_encoding_pats:
 100         raw = strip_encoding_declarations(raw)
 101     #if resolve_entities:
 102     #    raw = substitute_entites(raw)
 103
 104     return raw, encoding
 105
 106
 107 # -------------------------------------------
 108
 109 NAMESPACES = {
 110     'fb2'   :   'http://www.gribuser.ru/xml/fictionbook/2.0',
 111     'xlink' :   'http://www.w3.org/1999/xlink'  }
 112
 113 XPath = partial(etree.XPath, namespaces=NAMESPACES)
 114 tostring = partial(etree.tostring, method='text', encoding=unicode)
 115
 116 def get_metadata(stream):
 117     ''' Return fb2 metadata as a L{MetaInformation} object '''
 118
 119     mi = type('lamdbaobject', (object,), {})()
 120
 121     root = _get_fbroot(stream)
 122     if root is None:
 123         return mi
 124
 125     book_title = _parse_book_title(root)
 126     authors = _parse_authors(root)
 127
 128     # fallback for book_title
 129     if book_title:
 130         book_title = unicode(book_title)
 131     else:
 132 #        book_title = force_unicode(os.path.splitext(
 133 #            os.path.basename(getattr(stream, 'name',
 134 #                _('Unknown'))))[0])
 135         book_title = force_unicode(getattr(stream, 'name'))
 136     mi.book_title = book_title
 137     mi.authors = authors
 138
 139     try:
 140         _parse_comments(root, mi)
 141     except:
 142         pass
 143     try:
 144         _parse_tags(root, mi)
 145     except:
 146         pass
 147     try:
 148         _parse_series(root, mi)
 149     except:
 150         pass
 151     try:
 152         _parse_isbn(root, mi)
 153     except:
 154         pass
 155     try:
 156         _parse_publisher(root, mi)
 157     except:
 158         pass
 159     try:
 160         _parse_pubdate(root, mi)
 161     except:
 162         pass
 163 #    try:
 164 #        _parse_timestamp(root, mi)
 165 #    except:
 166 #        pass
 167
 168     try:
 169         _parse_language(root, mi)
 170     except:
 171         pass
 172
 173     try:
 174         _parse_cover_data(root,'cover.jpg',mi)
 175     except:
 176         pass
 177     #_parse_uuid(root, mi)
 178
 179     #if DEBUG:
 180     #   prints(mi)
 181     return mi
 182
 183 def _parse_authors(root):
 184     authors = []
 185     # pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent!
 186     # Those are fallbacks: <src-title-info>, <document-info>
 187     for author_sec in ['title-info', 'src-title-info']:
 188         for au in XPath('//fb2:%s/fb2:author'%author_sec)(root):
 189             author = _parse_author(au)
 190             if author:
 191                 authors.append(author)
 192                 break
 193
 194     # if no author so far
 195     if not authors:
 196         #authors.append(_('Unknown'))
 197         authors.append('Unknown')
 198
 199     return authors
 200
 201 def _parse_author(elm_author):
 202     """ Returns a list of display author and sortable author"""
 203
 204     xp_templ = 'normalize-space(fb2:%s/text())'
 205
 206     author = XPath(xp_templ % 'first-name')(elm_author)
 207     lname = XPath(xp_templ % 'last-name')(elm_author)
 208     mname = XPath(xp_templ % 'middle-name')(elm_author)
 209
 210     if mname:
 211         author = (author + ' ' + mname).strip()
 212     if lname:
 213         author = (author + ' ' + lname).strip()
 214
 215     # fallback to nickname
 216     if not author:
 217         nname = XPath(xp_templ % 'nickname')(elm_author)
 218         if nname:
 219             author = nname
 220
 221     return author
 222
 223
 224 def _parse_book_title(root):
 225     # <title-info> has a priority.   (actually <title-info>  is mandatory)
 226     # other are backup solution (sequence is important. other then in fb2-doc)
 227     xp_ti = '//fb2:title-info/fb2:book-title/text()'
 228     xp_pi = '//fb2:publish-info/fb2:book-title/text()'
 229     xp_si = '//fb2:src-title-info/fb2:book-title/text()'
 230     book_title = XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root)
 231
 232     return book_title
 233
 234 #TODO add from calibre
 235 def _parse_cover_data(root, imgid, mi):
 236     elm_binary = XPath('//fb2:binary[@id="%s"]'%imgid)(root)
 237     if elm_binary:
 238         mimetype = elm_binary[0].get('content-type', 'image/jpeg')
 239         pic_data = elm_binary[0].text
 240         mi.cover = pic_data
 241
 242 def _parse_tags(root, mi):
 243     # pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent!
 244     # Those are fallbacks: <src-title-info>
 245     for genre_sec in ['title-info', 'src-title-info']:
 246         # -- i18n Translations-- ?
 247         tags = XPath('//fb2:%s/fb2:genre/text()' % genre_sec)(root)
 248         if tags:
 249             mi.tags = list(map(unicode, tags))
 250             break
 251
 252 def _parse_series(root, mi):
 253     # calibri supports only 1 series: use the 1-st one
 254     # pick up sequence but only from 1 secrion in prefered order
 255     # except <src-title-info>
 256
 257     #TODO parse all
 258     xp_ti = '//fb2:title-info/fb2:sequence[1]'
 259     xp_pi = '//fb2:publish-info/fb2:sequence[1]'
 260
 261     elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
 262     if elms_sequence:
 263         mi.series = elms_sequence[0].get('name', None)
 264         if mi.series:
 265             mi.series_index = elms_sequence[0].get('number', None)
 266
 267 def _parse_isbn(root, mi):
 268     # some people try to put several isbn in this field, but it is not allowed.  try to stick to the 1-st one in this case
 269     isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
 270     if isbn:
 271         # some people try to put several isbn in this field, but it is not allowed.  try to stick to the 1-st one in this case
 272         if ',' in isbn:
 273             isbn = isbn[:isbn.index(',')]
 274
 275         #TODO add from calibre
 276         #if check_isbn(isbn):
 277         mi.isbn = isbn
 278
 279 def _parse_comments(root, mi):
 280     # pick up annotation but only from 1 secrion <title-info>;  fallback: <src-title-info>
 281     for annotation_sec in ['title-info', 'src-title-info']:
 282         elms_annotation = XPath('//fb2:%s/fb2:annotation' % annotation_sec)(root)
 283         if elms_annotation:
 284             mi.comments = tostring(elms_annotation[0])
 285             # TODO: tags i18n, xslt?
 286             break
 287
 288 def _parse_publisher(root, mi):
 289     publisher = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
 290     if publisher:
 291         mi.publisher = publisher
 292
 293 def _parse_pubdate(root, mi):
 294     year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
 295     if float.is_integer(year):
 296         # only year is available, so use 1-st of Jan
 297         mi.pubdate = datetime.date(int(year), 1, 1)
 298
 299 def _parse_timestamp(root, mi):
 300     #<date value="1996-12-03">03.12.1996</date>
 301     xp ='//fb2:document-info/fb2:date/@value|'\
 302         '//fb2:document-info/fb2:date/text()'
 303     docdate = XPath('string(%s)' % xp)(root)
 304     if docdate:
 305         #TODO add from calibre
 306         #mi.timestamp = parse_date(docdate)
 307         mi.timestamp = docdate
 308
 309 def _parse_language(root, mi):
 310     language = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
 311     if language:
 312         mi.language = language
 313         mi.languages = [ language ]
 314
 315 def _parse_uuid(root, mi):
 316     uuid = XPath('normalize-space(//document-info/fb2:id/text())')(root)
 317     if uuid:
 318         mi.uuid = uuid
 319
 320 def _get_fbroot(stream):
 321     parser = etree.XMLParser(recover=True, no_network=True)
 322     raw = stream.read()
 323     raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
 324     root = etree.fromstring(raw, parser=parser)
 325     return root