Commit 7f4fb2a3 authored by Dmitry Shelepnev's avatar Dmitry Shelepnev
Browse files

Add updated inpxparser1 and sopdscan1 for analise

parent 2a25702b
Loading
Loading
Loading
Loading
+115 −0
Original line number Diff line number Diff line
'''
Created on 14 нояб. 2016 г.

@author: Shelepnev, Dmitry
'''

# -*- coding: utf-8 -*-
import os
import zipfile
#from opds_catalog import settings
from constance import config

sAuthor = 'AUTHOR'
sGenre  = 'GENRE'
sTitle  = 'TITLE'
sSeries = 'SERIES'
sSerNo  = 'SERNO'
sFile   = 'FILE'
sSize   = 'SIZE'
sLibId  = 'LIBID'
sDel    = 'DEL'
sExt    = 'EXT'
sDate   = 'DATE'
sLang   = 'LANG'
sInsNo  = 'INSNO'
sFolder = 'FOLDER'
sLibRate= 'LIBRATE'
sKeyWords='KEYWORDS'


class Inpx:
    def __init__(self, inpx_file, append_callback, inpskip_callback = lambda inpx,inp,size:0):
        self.inpx_file = inpx_file
        self.inpx_catalog = os.path.dirname(inpx_file)
        self.inpx_structure = False
        self.inpx_format = []
        self.inpx_archive = False
        self.inpx_arch_fnames = []
        self.inpx_encoding = 'utf-8'
        self.inpx_separator = b'\x04'
        self.inpx_itemseparator = ':'
        self.append_callback = append_callback
        self.inpskip_callback = inpskip_callback
        self.TEST_ZIP = config.SOPDS_INPX_TEST_ZIP
        self.TEST_FILES = config.SOPDS_INPX_TEST_FILES
        self.error = 0       
        
    def parse(self):
        finpx = zipfile.ZipFile(self.inpx_file, "r")
        filelist = finpx.namelist()
        # здесь читаем формат файлов inp, если есть, если нет, то по умолчанию
        if 'structure.info' in filelist:
            self.inpx_structure = True
            fsds = finpx.open('structure.info')
            fsb = str(fsds.read(),'utf-8')
            self.inpx_format = fsb.split(';')
        else:
            self.inpx_format = [sAuthor,sGenre,sTitle,sSeries,sSerNo,sFile,sSize,sLibId,sDel,sExt,sDate,sLang]
        
        # здесь читаем список архивов в коллекции, если указано явно
        if 'archives.info' in filelist:
            self.inpx_archive = True
            self.inpx_arch_fnames = finpx.open('archives.info').readlines()
        # эту информацию надо как-то использовать, чтобы протестировать наличие zip
        
        for inp_file in filelist:
            (inp_name,inp_ext) = os.path.splitext(inp_file)
            
            # Если файл не INP то пропускаем
            if inp_ext.upper() != '.INP':
                continue
            
            zip_file_name = os.path.join(self.inpx_catalog,"%s%s"%(inp_name,'.zip'))
            # Если решили проверять на наличие ZIP файла или книги в ZIP, а самого ZIP файла нет - то пропускаем обработку всего ZIP файла
            if (self.TEST_ZIP or self.TEST_FILES) and not os.path.isfile(zip_file_name):
                continue
            if self.inpskip_callback(self.inpx_file, inp_name,finpx.getinfo(inp_file).file_size):
                continue             
            
            # Если будем проверять наличие файлов в ZIP то однократно получаем список файлов в обрабатываемом ZIP 
            if self.TEST_FILES: 
                testzip = zipfile.ZipFile(zip_file_name, "r")
                testzip_namelist = testzip.namelist()
            else:
                testzip_namelist = []
            
            finp = finpx.open(inp_file)
            for line in finp:
                meta_list = line.split(self.inpx_separator)
                meta_data = {}
                for idx, key in enumerate(self.inpx_format):
                    try:          
                        if key in [sAuthor,sGenre,sSeries]:
                            meta_data[key] = meta_list[idx].decode(self.inpx_encoding).split(self.inpx_itemseparator)
                            if '' in  meta_data[key]:
                                meta_data[key].remove('')
                        else: 
                            meta_data[key] = meta_list[idx].decode(self.inpx_encoding)
                    except IndexError:
                        meta_data[key] = '' 
                                  
                # Если книга помечена как удаленная в INP, то пропускаем вызов callback   
                if not (meta_data[sDel].strip() in ['','0']):
                    continue          
                
                # Если нужно выполнить проверку книги в ZIP, а ее там не оказалось, то пропускаем вызов callback            
                if self.TEST_FILES:      
                    if not "%s.%s"(meta_data[sFile],meta_data[sExt]) in testzip_namelist:
                        continue 
                            
                self.append_callback(self.inpx_file, inp_name, meta_data)
				    
            finp.close()
        finpx.close()
        
+258 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-

import os
import time
import datetime
import logging
import re

from book_tools.format import create_bookfile
from book_tools.format.util import strip_symbols

from django.db import transaction
from django.utils.translation import ugettext as _

from opds_catalog import fb2parse, opdsdb
from opds_catalog import inpx_parser
import opds_catalog.zipf as zipfile

from constance import config

class opdsScanner:
    def __init__(self, logger=None):
        self.fb2parser=None
        self.init_parser()

        if logger:
            self.logger = logger
        else:
            self.logger = logging.getLogger('')
            self.logger.setLevel(logging.CRITICAL)
        self.init_stats()

    def init_stats(self):
        self.t1=datetime.timedelta(seconds=time.time())
        self.t2=self.t1
        self.t3=self.t1
        self.books_added = 0
        self.books_skipped = 0
        self.books_deleted = 0
        self.arch_scanned = 0
        self.arch_skipped = 0
        self.bad_archives = 0
        self.bad_books = 0
        self.books_in_archives = 0

    def init_parser(self):
        self.fb2parser=fb2parse.fb2parser(False)

    def log_options(self):
        self.logger.info(' ***** Starting sopds-scan...')
        self.logger.debug('OPTIONS SET')
        if config.SOPDS_ROOT_LIB!=None:       self.logger.debug('root_lib = %s'%config.SOPDS_ROOT_LIB)
        if config.SOPDS_FB2TOEPUB!=None: self.logger.debug('fb2toepub = %s'%config.SOPDS_FB2TOEPUB)
        if config.SOPDS_FB2TOMOBI!=None: self.logger.debug('fb2tomobi = %s'%config.SOPDS_FB2TOMOBI)
        if config.SOPDS_TEMP_DIR!=None:       self.logger.debug('temp_dir = %s'%config.SOPDS_TEMP_DIR)
        if config.SOPDS_FB2SAX != None:       self.logger.info('FB2SAX = %s'%config.SOPDS_FB2SAX)


    def log_stats(self):
        self.t2=datetime.timedelta(seconds=time.time())
        self.logger.info('Books added      : '+str(self.books_added))
        self.logger.info('Books skipped    : '+str(self.books_skipped))
        self.logger.info('Bad books        : '+str(self.bad_books))
        if config.SOPDS_DELETE_LOGICAL:
            self.logger.info('Books deleted    : '+str(self.books_deleted))
        else:
            self.logger.info('Books DB entries deleted : '+str(self.books_deleted))
        self.logger.info('Books in archives: '+str(self.books_in_archives))
        self.logger.info('Archives scanned : '+str(self.arch_scanned))
        self.logger.info('Archives skipped : '+str(self.arch_skipped))
        self.logger.info('Bad archives     : '+str(self.bad_archives))

        t=self.t2-self.t1
        seconds=t.seconds%60
        minutes=((t.seconds-seconds)//60)%60
        hours=t.seconds//3600
        self.logger.info('Time estimated:'+str(hours)+' hours, '+str(minutes)+' minutes, '+str(seconds)+' seconds.')

    def scan_all(self):
        self.init_stats()
        self.log_options()
        self.inp_cat = None
        self.zip_file = None
        self.rel_path = None     
                    
        opdsdb.avail_check_prepare()
            
        for full_path, dirs, files in os.walk(config.SOPDS_ROOT_LIB, followlinks=True):
            # Если разрешена обработка inpx, то при нахождении inpx обрабатываем его и прекращаем обработку текущего каталога
            if config.SOPDS_INPX_ENABLE:
                inpx_files = [inpx for inpx in files if re.match('.*(.inpx|.INPX)$', inpx)]
                # Пропускаем обработку файлов в текущем каталоге, если найдены inpx
                if inpx_files:
                    for inpx_file in inpx_files:
                        file = os.path.join(full_path, inpx_file)
                        self.processinpx(inpx_file, full_path, file)     # имя inpx, путь к каталогу рут-либ, полный путь с именем inpx
                    continue
                
            for name in files:
                file=os.path.join(full_path,name)
                (n,e)=os.path.splitext(name)
                if (e.lower() == '.zip'):
                    if config.SOPDS_ZIPSCAN:
                        self.processzip(name,full_path,file)
                else:
                    file_size=os.path.getsize(file)
                    self.processfile(name,full_path,file,None,0,file_size)                   

        #if config.SOPDS_DELETE_LOGICAL:
        #    self.books_deleted=opdsdb.books_del_logical()
        #else:
        #    self.books_deleted=opdsdb.books_del_phisical()
            
        self.books_deleted=opdsdb.books_del_phisical()
        
        self.log_stats()

    def inpskip_callback(self, inpx, inp_name, inp_size):
        
        self.zip_file = os.path.join(inpx,"%s%s"%(inp_name,'.zip'))
        self.rel_path=os.path.relpath(self.zip_file,config.SOPDS_ROOT_LIB)            
        
        if opdsdb.arc_skip(self.rel_path,inp_size):
            self.logger.info('Skip ZIP for INP archive '+self.zip_file+'. Not changed.')
            result = 1               
        else:    
            self.logger.info('Start process ZIP for INP archive = '+self.zip_file) 
            self.inp_cat = opdsdb.addcattree(self.rel_path, opdsdb.CAT_INP, inp_size)
            result = 0
         
        return result
                
    def inpx_callback(self, inpx, inp, meta_data):          
                 
        name = "%s.%s"%(meta_data[inpx_parser.sFile],meta_data[inpx_parser.sExt])
        
        lang=meta_data[inpx_parser.sLang].strip(strip_symbols)
        title=meta_data[inpx_parser.sTitle].strip(strip_symbols)
        annotation=''
        docdate=meta_data[inpx_parser.sDate].strip(strip_symbols)
        
        inpx_path, inp_name_zip = os.path.split(self.rel_path)
        self.rel_path = inpx_path+"/"+meta_data[inpx_parser.sFolder]
        
        book=opdsdb.addbook(name,self.rel_path,self.inp_cat,meta_data[inpx_parser.sExt],title,annotation,docdate,lang,meta_data[inpx_parser.sSize],opdsdb.CAT_INP)
        self.books_added+=1
        self.books_in_archives+=1
        self.logger.debug("Book "+self.rel_path+"/"+name+" Added ok.")    
        
        for a in meta_data[inpx_parser.sAuthor]:
            author=opdsdb.addauthor(a.replace(',',' '))
            opdsdb.addbauthor(book,author)

        for g in meta_data[inpx_parser.sGenre]:
            opdsdb.addbgenre(book,opdsdb.addgenre(g.lower().strip(strip_symbols)))
            
        for s in meta_data[inpx_parser.sSeries]:
            ser=opdsdb.addseries(s.strip())
            opdsdb.addbseries(book,ser,0)                         
                   
    def processinpx(self,name,full_path,file):            # имя inpx, путь к каталогу рут-либ, полный путь с именем inpx
        rel_file=os.path.relpath(file,config.SOPDS_ROOT_LIB)
        inpx_size = os.path.getsize(file)
        if config.SOPDS_INPX_SKIP_UNCHANGED and opdsdb.inpx_skip(rel_file,inpx_size):
            self.logger.info('Skip INPX file = '+file+'. Not changed.')
        else:    
            self.logger.info('Start process INPX file = '+file)
            opdsdb.addcattree(rel_file, opdsdb.CAT_INPX, inpx_size)
            inpx = inpx_parser.Inpx(file, self.inpx_callback, self.inpskip_callback)       
            inpx.INPX_TEST_ZIP = config.SOPDS_INPX_TEST_ZIP  
            inpx.INPX_TEST_FILES = config.SOPDS_INPX_TEST_FILES 
            inpx.parse()

    def processzip(self,name,full_path,file):
        rel_file=os.path.relpath(file,config.SOPDS_ROOT_LIB)
        zsize = os.path.getsize(file)
        if opdsdb.arc_skip(rel_file,zsize):
            self.arch_skipped+=1
            self.logger.debug('Skip ZIP archive '+rel_file+'. Already scanned.')
        else:                   
            zip_process_error = 0
            try:
                z = zipfile.ZipFile(file, 'r', allowZip64=True)
                filelist = z.namelist()
                cat = opdsdb.addcattree(rel_file, opdsdb.CAT_ZIP, zsize)
                for n in filelist:
                    try:
                        self.logger.debug('Start process ZIP file = '+file+' book file = '+n)
                        file_size=z.getinfo(n).file_size
                        bookfile = z.open(n)
                        self.processfile(n,file,bookfile,cat,opdsdb.CAT_ZIP,file_size)
                        bookfile.close()
                    except zipfile.BadZipFile:
                        self.logger.warning('Error processing ZIP file = '+file+' book file = '+n)
                        zip_process_error = 1
                z.close()
                self.arch_scanned+=1
            except zipfile.BadZipFile:
                self.logger.warning('Error while read ZIP archive. File '+file+' corrupt.')
                zip_process_error = 1
            self.bad_archives+=zip_process_error

    def processfile(self,name,full_path,file,cat,archive=0,file_size=0):
        (n, e) = os.path.splitext(name)
        if e.lower() in config.SOPDS_BOOK_EXTENSIONS.split():
            rel_path=os.path.relpath(full_path,config.SOPDS_ROOT_LIB)
            self.logger.debug("Attempt to add book "+rel_path+"/"+name)
            try:
                if opdsdb.findbook(name, rel_path, 1) == None:
                    if archive==0:
                        cat=opdsdb.addcattree(rel_path,archive)

                    try:
                        book_data = create_bookfile(file, name)
                    except Exception as err:
                        book_data = None
                        self.logger.warning(rel_path + ' - ' + name + ' Book parse error, skipping... (Error: %s)'%err)
                        self.bad_books += 1

                    if book_data:
                        lang = book_data.language_code.strip(strip_symbols) if book_data.language_code else ''
                        title = book_data.title.strip(strip_symbols) if book_data.title else n
                        annotation = book_data.description if book_data.description else ''
                        annotation = annotation.strip(strip_symbols) if isinstance(annotation, str) else annotation.decode('utf8').strip(strip_symbols)
                        docdate = book_data.docdate if book_data.docdate else ''

                        book=opdsdb.addbook(name,rel_path,cat,e[1:],title,annotation,docdate,lang,file_size,archive)
                        self.books_added+=1

                        if archive!=0:
                            self.books_in_archives+=1
                        self.logger.debug("Book "+rel_path+"/"+name+" Added ok.")

                        for a in book_data.authors:
                            author_name = a.get('name',_('Unknown author')).strip(strip_symbols)
                            # Если в имени автора нет запятой, то фамилию переносим из конца в начало
                            if author_name and author_name.find(',')<0:
                                author_names = author_name.split()
                                author_name = ' '.join([author_names[-1],' '.join(author_names[:-1])])
                            author=opdsdb.addauthor(author_name)
                            opdsdb.addbauthor(book,author)

                        for genre in book_data.tags:
                            opdsdb.addbgenre(book,opdsdb.addgenre(genre.lower().strip(strip_symbols)))


                        if book_data.series_info:
                            ser = opdsdb.addseries(book_data.series_info['title'])
                            ser_no = book_data.series_info['index']  or '0'
                            ser_no = int(ser_no) if ser_no.isdigit() else 0
                            opdsdb.addbseries(book,ser,ser_no)
                else:
                    self.books_skipped+=1
                    self.logger.debug("Book "+rel_path+"/"+name+" Already in DB.")
            except UnicodeEncodeError as err:
                self.logger.warning(rel_path + ' - ' + name + ' Book UnicodeEncodeError error, skipping... (Error: %s)' % err)
                self.bad_books += 1