1 from __future__ import with_statement
3 __copyright__ = '2011, Roman Mukhin <ramses_ru at hotmail.com>, '\
4 '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
5 '''Read meta information from fb2 files'''
8 # TODO clean up and save only needed (sorry for this code:) )
10 # -------------------------------------------
12 #From calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
13 # /src/calibre/ebooks/metadata/fb2.py
14 #Based on revision 10897
18 from functools import partial
19 # from base64 import b64decode
20 from lxml import etree
21 #from calibre.utils.date import parse_date
22 #from calibre import guess_type, guess_all_extensions, prints, force_unicode
23 #from calibre.ebooks.metadata import MetaInformation, check_isbn
24 #from calibre.ebooks.chardet import xml_to_unicode
27 # -------------------------------------------
def force_unicode(text):
    """Return *text* as a unicode string.

    Unicode input is returned unchanged; anything else is decoded as UTF-8.

    Raises whatever the text-type constructor raises for input that cannot
    be decoded (for example ``UnicodeDecodeError`` for invalid UTF-8).
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: ``str`` is the unicode type
    if isinstance(text, text_type):
        return text
    return text_type(text, encoding='utf-8')
36 # -------------------------------------------
37 # from calibre http://bazaar.launchpad.net/~kovid/calibre/trunk/view/head:
38 # /src/calibre/ebooks/chardet/__init__.py
# Patterns that capture the declared character encoding (group 1) from an
# XML declaration and from an HTML <meta ... content="...charset=..."> tag.
# Matching is case-insensitive because the charset character class only
# lists lower-case letters.
ENCODING_PATS = [
    re.compile(r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
               re.IGNORECASE),
    re.compile(r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>''',
               re.IGNORECASE),
]
def strip_encoding_declarations(raw):
    """Remove encoding declarations matched by ENCODING_PATS from *raw*.

    Only the first 50 KiB are scanned: declarations live at the top of a
    document, and limiting the search keeps the substitution cheap on large
    files. The remainder of the text is passed through untouched.

    Returns the text with the matched declarations removed.
    """
    limit = 50 * 1024
    prefix, suffix = raw[:limit], raw[limit:]
    for pat in ENCODING_PATS:
        prefix = pat.sub('', prefix)
    return prefix + suffix
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of a byte string to unicode.

    The encoding is taken from a byte-order mark if present, otherwise from
    an XML/HTML encoding declaration (see ENCODING_PATS); when neither is
    found, or the declared codec is unknown, UTF-8 is used. Undecodable
    bytes are replaced rather than raising.

    ``verbose``, ``resolve_entities`` and ``assume_utf8`` are accepted for
    signature compatibility and are ignored by this implementation.

    @param strip_encoding_pats: when true, encoding declarations are removed
        from the returned text.
    @return: (unicode, encoding used); the encoding is None when ``raw`` was
        empty or already unicode.
    '''
    encoding = None
    if not raw:
        return u'', encoding

    if not isinstance(raw, unicode):
        # A byte-order mark is authoritative; drop it from the decoded text.
        if raw.startswith(codecs.BOM_UTF8):
            raw, encoding = raw.decode('utf-8')[1:], 'utf-8'
        elif raw.startswith(codecs.BOM_UTF16_LE):
            raw, encoding = raw.decode('utf-16-le')[1:], 'utf-16-le'
        elif raw.startswith(codecs.BOM_UTF16_BE):
            raw, encoding = raw.decode('utf-16-be')[1:], 'utf-16-be'

    if not isinstance(raw, unicode):
        # No BOM: look for an explicit declaration in the markup.
        for pat in ENCODING_PATS:
            match = pat.search(raw)
            if match:
                encoding = match.group(1)
                break
        if encoding is None:
            encoding = 'utf-8'
        try:
            if encoding.lower().strip() == 'macintosh':
                encoding = 'mac-roman'
            if encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set
                # to gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                encoding = 'gbk'
            raw = raw.decode(encoding, 'replace')
        except LookupError:
            # The declared codec is unknown to Python: fall back to UTF-8.
            encoding = 'utf-8'
            raw = raw.decode(encoding, 'replace')

    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    # Entity resolution is not ported from calibre; resolve_entities is a no-op.

    return raw, encoding
107 # -------------------------------------------
# XML namespace prefixes used by every XPath expression in this module.
NAMESPACES = {
    'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0',
    'xlink': 'http://www.w3.org/1999/xlink',
}
# XPath factory with the FictionBook namespace map pre-bound, so expressions
# can use the ``fb2:`` and ``xlink:`` prefixes directly.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
# Serialize an element to its text content only, as a unicode string.
tostring = partial(etree.tostring, encoding=unicode, method='text')
def get_metadata(stream):
    '''Return fb2 metadata read from *stream* as a plain attribute object.

    ``book_title`` and ``authors`` are always set once the document parses.
    The remaining fields (comments, tags, series, isbn, publisher, pubdate,
    language, cover) are filled on a best-effort basis: a failure in one of
    them leaves that field unset and does not abort the others.

    @param stream: file-like object with ``read()``; its ``name`` attribute,
        when present, is the fallback for a missing book title.
    @return: the metadata object; it carries no attributes when the document
        could not be parsed at all.
    '''
    mi = type('lamdbaobject', (object,), {})()

    root = _get_fbroot(stream)
    if root is None:
        # A recovering parser may yield no root for hopeless input.
        return mi

    book_title = _parse_book_title(root)
    authors = _parse_authors(root)

    # Fall back to the stream name when the document has no title.
    if book_title:
        book_title = unicode(book_title)
    else:
        book_title = force_unicode(getattr(stream, 'name', 'Unknown'))
    mi.book_title = book_title
    mi.authors = authors

    # _parse_timestamp and _parse_uuid are intentionally not run here.
    optional_parsers = (
        _parse_comments,
        _parse_tags,
        _parse_series,
        _parse_isbn,
        _parse_publisher,
        _parse_pubdate,
        _parse_language,
        lambda r, m: _parse_cover_data(r, 'cover.jpg', m),
    )
    for parse in optional_parsers:
        try:
            parse(root, mi)
        except Exception:
            # Optional field: skip it rather than lose the whole record.
            pass

    return mi
def _parse_authors(root):
    """Return the list of author display names, never empty.

    Authors are taken from a single section only, because mixing sections
    gives inconsistent results: <title-info> first, then <src-title-info>
    as a fallback. When no author is found, the list is ``['Unknown']``.
    """
    authors = []
    for author_sec in ['title-info', 'src-title-info']:
        for au in XPath('//fb2:%s/fb2:author' % author_sec)(root):
            author = _parse_author(au)
            if author:
                authors.append(author)
        if authors:
            # This section produced authors: do not consult the fallbacks.
            break

    # if no author so far
    if not authors:
        authors.append('Unknown')

    return authors
def _parse_author(elm_author):
    """Return the display name for one <author> element as a string.

    The name is "first middle last" with missing parts skipped; when all
    three are empty the <nickname> is used. The result is an empty string
    if the element carries none of them.
    """
    xp_templ = 'normalize-space(fb2:%s/text())'

    author = XPath(xp_templ % 'first-name')(elm_author)
    lname = XPath(xp_templ % 'last-name')(elm_author)
    mname = XPath(xp_templ % 'middle-name')(elm_author)

    if mname:
        author = (author + ' ' + mname).strip()
    if lname:
        author = (author + ' ' + lname).strip()

    # fallback to nickname
    if not author:
        nname = XPath(xp_templ % 'nickname')(elm_author)
        if nname:
            author = nname

    return author
def _parse_book_title(root):
    """Return the whitespace-normalized book title, or '' if none is found.

    <title-info> has priority (it is mandatory in FB2); <publish-info> and
    <src-title-info> are backups, tried in that order. Each section is
    queried separately so the priority does not depend on the order in
    which the sections happen to appear in the document.
    """
    for section in ('title-info', 'publish-info', 'src-title-info'):
        xp = 'normalize-space(//fb2:%s/fb2:book-title/text())' % section
        book_title = XPath(xp)(root)
        if book_title:
            return book_title
    return ''
234 #TODO add from calibre
def _parse_cover_data(root, imgid, mi):
    """Store the cover image found in <binary id="imgid"> on *mi*.

    Sets ``mi.cover_data`` to ``(mimetype, data)`` where ``mimetype`` is the
    element's ``content-type`` (default ``image/jpeg``) and ``data`` is the
    element text exactly as stored in the file (FB2 embeds binaries as
    base64 text; it is not decoded here). Nothing is set when the element is
    missing or empty.
    """
    elm_binary = XPath('//fb2:binary[@id="%s"]' % imgid)(root)
    if not elm_binary:
        return
    mimetype = elm_binary[0].get('content-type', 'image/jpeg')
    pic_data = elm_binary[0].text
    if pic_data:
        mi.cover_data = (mimetype, pic_data)
def _parse_tags(root, mi):
    """Set ``mi.tags`` to the list of genre strings of the book.

    Genres are taken from a single section only, because mixing sections
    gives inconsistent results: <title-info> first, then <src-title-info>
    as a fallback. ``mi.tags`` is left unset when neither has a genre.
    """
    for genre_sec in ['title-info', 'src-title-info']:
        # -- i18n Translations-- ?
        tags = XPath('//fb2:%s/fb2:genre/text()' % genre_sec)(root)
        if tags:
            mi.tags = list(map(unicode, tags))
            break
def _parse_series(root, mi):
    """Set ``mi.series`` and ``mi.series_index`` from the first <sequence>.

    Only one series is supported, so the first <sequence> found in
    <title-info> or <publish-info> is used (<src-title-info> is deliberately
    excluded). ``series_index`` is the raw ``number`` attribute (a string or
    None) and is only set when the sequence has a non-empty name.
    """
    xp_ti = '//fb2:title-info/fb2:sequence[1]'
    xp_pi = '//fb2:publish-info/fb2:sequence[1]'

    elms_sequence = XPath('%s|%s' % (xp_ti, xp_pi))(root)
    if elms_sequence:
        mi.series = elms_sequence[0].get('name', None)
        if mi.series:
            mi.series_index = elms_sequence[0].get('number', None)
def _parse_isbn(root, mi):
    """Set ``mi.isbn`` from <publish-info>/<isbn>, if present.

    Some files put several comma-separated ISBNs in this field although
    that is not allowed; only the first one is kept. The value is not
    validated.
    """
    isbn = XPath('normalize-space(//fb2:publish-info/fb2:isbn/text())')(root)
    if isbn:
        if ',' in isbn:
            isbn = isbn[:isbn.index(',')]
        # TODO: validate with calibre's check_isbn once it is ported.
        mi.isbn = isbn
def _parse_comments(root, mi):
    """Set ``mi.comments`` to the text of the book's <annotation>.

    The annotation is taken from a single section only: <title-info> first,
    then <src-title-info> as a fallback. Markup inside the annotation is
    dropped; only its text content is kept.
    """
    for annotation_sec in ['title-info', 'src-title-info']:
        elms_annotation = XPath('//fb2:%s/fb2:annotation' % annotation_sec)(root)
        if elms_annotation:
            mi.comments = tostring(elms_annotation[0])
            # TODO: tags i18n, xslt?
            break
def _parse_publisher(root, mi):
    """Set ``mi.publisher`` from <publish-info>/<publisher>, if non-empty."""
    publisher = XPath('string(//fb2:publish-info/fb2:publisher/text())')(root)
    if publisher:
        mi.publisher = publisher
def _parse_pubdate(root, mi):
    """Set ``mi.pubdate`` from <publish-info>/<year>, if it is a usable year.

    FB2 only records the year, so the date is the 1st of January of that
    year. Nothing is set when the year is missing, not a whole number, or
    outside the range ``datetime.date`` supports.
    """
    year = XPath('number(//fb2:publish-info/fb2:year/text())')(root)
    # number() yields NaN for missing or non-numeric text; NaN is not integral.
    if not float.is_integer(year):
        return
    year = int(year)
    if datetime.MINYEAR <= year <= datetime.MAXYEAR:
        # only year is available, so use 1-st of Jan
        mi.pubdate = datetime.date(year, 1, 1)
def _parse_timestamp(root, mi):
    """Set ``mi.timestamp`` from <document-info>/<date>, if present.

    Example element: <date value="1996-12-03">03.12.1996</date>. The value
    is taken from the first of the ``value`` attribute / element text in
    document order and stored as an unparsed string.
    """
    xp = '//fb2:document-info/fb2:date/@value|'\
         '//fb2:document-info/fb2:date/text()'
    docdate = XPath('string(%s)' % xp)(root)
    if docdate:
        # TODO: parse with calibre's parse_date once it is ported.
        mi.timestamp = docdate
def _parse_language(root, mi):
    """Set ``mi.language`` and ``mi.languages`` from <title-info>/<lang>.

    ``languages`` is a one-element list holding the same value. Nothing is
    set when the document declares no language.
    """
    language = XPath('string(//fb2:title-info/fb2:lang/text())')(root)
    if language:
        mi.language = language
        mi.languages = [language]
def _parse_uuid(root, mi):
    """Set ``mi.uuid`` from <document-info>/<id>, if non-empty."""
    # Every step needs the fb2: prefix; an unprefixed document-info step
    # only matches elements in no namespace and so never finds the id.
    uuid = XPath('normalize-space(//fb2:document-info/fb2:id/text())')(root)
    if uuid:
        mi.uuid = uuid
def _get_fbroot(stream):
    """Read *stream* and return the root element of the parsed FB2 document.

    The content is converted to unicode with its encoding declaration
    stripped, because lxml refuses unicode input that still carries one.
    The parser recovers from malformed markup and never touches the network.
    """
    parser = etree.XMLParser(recover=True, no_network=True)
    raw = stream.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    root = etree.fromstring(raw, parser=parser)
    return root