From f81f10ad853add7542824e76f1ef8312eb53892d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20Pi=C4=8Dman?= Date: Tue, 29 May 2018 13:47:53 +0200 Subject: [PATCH] Xapian not indexing repository if project configuration is blank #857 --- extra/xapian_indexer.rb | 330 +++------------------------------------- 1 file changed, 23 insertions(+), 307 deletions(-) diff --git a/extra/xapian_indexer.rb b/extra/xapian_indexer.rb index 462b11c1..fc5164be 100644 --- a/extra/xapian_indexer.rb +++ b/extra/xapian_indexer.rb @@ -45,13 +45,10 @@ $dbrootpath = '/var/tmp/dmsf-index' $verbose = 0 # Define stemmed languages to index attachments Eg. [ 'english', 'italian', 'spanish' ] -# Repository database will be always indexed in english -# Available languages are danish dutch english finnish french german german2 hungarian italian kraaij_pohlmann lovins norwegian porter portuguese romanian russian spanish swedish turkish: +# Available languages are danish dutch english finnish french german german2 hungarian italian kraaij_pohlmann lovins +# norwegian porter portuguese romanian russian spanish swedish turkish: $stem_langs = ['english'] -# Project identifiers that will be indexed eg. [ 'prj_id1', 'prj_id2' ] -$projects = [ 'prj_id1', 'prj_id2' ] - # Temporary directory for indexing, it can be tmpfs $tempdir = '/tmp' @@ -69,13 +66,8 @@ $unrtf = '/usr/bin/unrtf -t text 2>/dev/null' ################################################################################################ $environment = File.join($redmine_root, 'config/environment.rb') -$projects = Array.new $databasepath = nil -$repositories = nil -$onlyfiles = nil -$onlyrepos = nil $env = 'production' -$resetlog = nil $retryfailed = nil MIME_TYPES = { @@ -112,30 +104,24 @@ FORMAT_HANDLERS = { require 'optparse' VERSION = '0.1' -SUPPORTED_SCM = %w(Subversion Darcs Mercurial Bazaar Git Filesystem) optparse = OptionParser.new do |opts| opts.banner = 'Usage: xapian_indexer.rb [OPTIONS...]' opts.separator('') - opts.separator('Index redmine files and repositories') + opts.separator('Index Redmine DMS documents') opts.separator('') opts.separator('') opts.separator('Options:') - opts.on('-p', '--projects a,b,c', Array, 'Comma separated list of projects to index') { |p| $projects = p } opts.on('-s', '--stemming_lang a,b,c', Array,'Comma separated list of stemming languages for indexing') { |s| $stem_langs = s } - opts.on('-v', '--verbose', 'verbose') {$verbose += 1} - opts.on('-f', '--files', 'Only index Redmine attachments') { $onlyfiles = 1 } - opts.on('-r', '--repositories', 'Only index Redmine repositories') { $onlyrepos = 1 } + opts.on('-v', '--verbose', 'verbose') {$verbose += 1}} opts.on('-e', '--environment ENV', 'Rails ENVIRONMENT (development, testing or production), default production') { |e| $env = e} - opts.on('-t', '--temp-dir PATH', 'Temporary directory for indexing'){ |t| $tempdir = t } - opts.on('-x', '--resetlog', 'Reset index log'){ $resetlog = 1 } + opts.on('-t', '--temp-dir PATH', 'Temporary directory for indexing'){ |t| $tempdir = t } opts.on('-V', '--version', 'show version and exit') { puts VERSION; exit} opts.on('-h', '--help', 'show help and exit') { puts opts; exit } opts.on('-R', '--retry-failed', 'retry files which omindex failed to extract text') { $retryfailed = 1 } opts.separator('') opts.separator('Examples:') - opts.separator(' xapian_indexer.rb -f -s english,italian -v') - opts.separator(' xapian_indexer.rb -p project_id -x -t /tmpfs -v') + opts.separator(' xapian_indexer.rb -s english,italian -v') opts.separator('') opts.summary_width = 25 end @@ -152,187 +138,11 @@ DELETE = 0 class IndexingError < StandardError; end -def repo_name(repository) - repository.identifier.blank? ? 'main' : repository.identifier -end - -def indexing(databasepath, project, repository) - log "Fetch changesets: #{project.name} - #{repo_name(repository)}" - repository.fetch_changesets - repository.reload.changesets.reload - - latest_changeset = repository.changesets.first - return unless latest_changeset - - log "Latest revision: #{project.name} - #{repo_name(repository)} - #{latest_changeset.revision}" - latest_indexed = Indexinglog.where(:repository_id => repository.id, :status => STATUS_SUCCESS).last - Rails.logger.debug "Debug latest_indexed #{latest_indexed.inspect}" - begin - indexconf = Tempfile.new('index.conf', $tempdir) - indexconf.write "url : field boolean=Q unique=Q\n" - indexconf.write "body : index truncate=400 field=sample\n" - indexconf.write "date: field=date\n" - indexconf.close - unless latest_indexed - log "Repository #{repo_name(repository)} not indexed, indexing all" - indexing_all(databasepath, indexconf, project, repository) - else - log "Repository #{repo_name(repository)} indexed, indexing diff" - indexing_diff(databasepath, indexconf, project, repository, - latest_indexed.changeset, latest_changeset) - end - indexconf.unlink - rescue IndexingError => e - add_log(repository, latest_changeset, STATUS_FAIL, e.message) - else - add_log(repository, latest_changeset, STATUS_SUCCESS) - log "Successfully indexed: #{project.name} - #{repo_name(repository)} - #{latest_changeset.revision}" - end -end - def supported_mime_type(entry) mtype = Redmine::MimeType.of(entry) MIME_TYPES.include?(mtype) || Redmine::MimeType.is_type?('text', mtype) end -def add_log(repository, changeset, status, message = nil) - log = Indexinglog.where(:repository_id => repository.id).last - unless log - log = Indexinglog.new - log.repository = repository - log.changeset = changeset - log.status = status - log.message = message if message - log.save! - log "New log for repo #{repo_name(repository)} saved!" - else - log.changeset_id=changeset.id - log.status=status - log.message = message if message - log.save! - log "Log for repo #{repo_name(repository)} updated!" - end -end - -def update_log(repository, changeset, status, message = nil) - log = Indexinglog.where(:repository_id => repository.id).last - if log - log.changeset_id = changeset.id - log.status = status if status - log.message = message if message - log.save! - log "Log for repo #{repo_name(repository)} updated!" - end -end - -def delete_log(repository) - Indexinglog.where(:repository_id => repository.id).delete_all - log "Log for repo #{repo_name(repository)} removed!" -end - -def walk(databasepath, indexconf, project, repository, identifier, entries) - return if entries.nil? || entries.size < 1 - log "Walk entries size: #{entries.size}" - entries.each do |entry| - log "Walking into: #{entry.lastrev.time}" - if entry.is_dir? - walk(databasepath, indexconf, project, repository, identifier, repository.entries(entry.path, identifier)) - elsif entry.is_file? - add_or_update_index(databasepath, indexconf, project, repository, identifier, entry.path, - entry.lastrev, ADD_OR_UPDATE, MIME_TYPES[Redmine::MimeType.of(entry.path)]) if supported_mime_type(entry.path) - end - end -end - -def indexing_all(databasepath, indexconf, project, repository) - Rails.logger.info "Indexing all: #{repo_name(repository)}" - if repository.branches - repository.branches.each do |branch| - log "Walking in branch: #{repo_name(repository)} - #{branch}" - walk(databasepath, indexconf, project, repository, branch, repository.entries(nil, branch)) - end - else - log "Walking in branch: #{repo_name(repository)} - [NOBRANCH]" - walk(databasepath, indexconf, project, repository, nil, repository.entries(nil, nil)) - end - if repository.tags - repository.tags.each do |tag| - log "Walking in tag: #{repo_name(repository)} - #{tag}" - walk(databasepath, indexconf, project, repository, tag, repository.entries(nil, tag)) - end - end -end - -def walkin(databasepath, indexconf, project, repository, identifier, changesets) - log "Walking into #{changesets.inspect}" - return unless changesets or changesets.size <= 0 - changesets.sort! { |a, b| a.id <=> b.id } - - actions = Hash::new - # SCM actions - # * A - Add - # * M - Modified - # * R - Replaced - # * D - Deleted - changesets.each do |changeset| - log "Changeset changes for #{changeset.id} #{changeset.filechanges.inspect}" - next unless changeset.filechanges - changeset.filechanges.each do |change| - actions[change.path] = (change.action == 'D') ? DELETE : ADD_OR_UPDATE - end - end - return unless actions - actions.each do |path, action| - entry = repository.entry(path, identifier) - if ((!entry.nil? && entry.is_file?) || action == DELETE) - log("Error indexing path: #{path.inspect}, action: #{action.inspect}, identifier: #{identifier.inspect}", true) if (entry.nil? && action != DELETE) - log "Entry to index #{entry.inspect}" - lastrev = entry.lastrev unless entry.nil? - add_or_update_index(databasepath, indexconf, project, repository, - identifier, path, lastrev, action, MIME_TYPES[Redmine::MimeType.of(path)]) if(supported_mime_type(path) || action == DELETE) - end - end - end - -def indexing_diff(databasepath, indexconf, project, repository, diff_from, diff_to) - if diff_from.id >= diff_to.id - log "Already indexed: #{repo_name(repository)} (from: #{diff_from.id} to #{diff_to.id})" - return - end - - log "Indexing diff: #{repo_name(repository)} (from: #{diff_from.id} to #{diff_to.id})" - log "Indexing all: #{repo_name(repository)}" - - if repository.branches - repository.branches.each do |branch| - log "Walking in branch: #{repo_name(repository)} - #{branch}" - walkin(databasepath, indexconf, project, repository, branch, repository.latest_changesets('', branch, diff_to.id - diff_from.id).select { |changeset| - changeset.id > diff_from.id and changeset.id <= diff_to.id}) - end - else - log "Walking in branch: #{repo_name(repository)} - [NOBRANCH]" - walkin(databasepath, indexconf, project, repository, nil, repository.latest_changesets('', nil, diff_to.id - diff_from.id).select { |changeset| - changeset.id > diff_from.id and changeset.id <= diff_to.id}) - end - if repository.tags - repository.tags.each do |tag| - log "Walking in tag: #{repo_name(repository)} - #{tag}" - walkin(databasepath, indexconf, project, repository, tag, repository.latest_changesets('', tag, diff_to.id - diff_from.id).select { |changeset| - changeset.id > diff_from.id and changeset.id <= diff_to.id}) - end - end -end - -def generate_uri(project, repository, identifier, path) - return url_for(:controller => 'repositories', - :action => 'entry', - :id => project.identifier, - :repository_id => repository.identifier, - :rev => identifier, - :path => repository.relative_path(path), - :only_path => true) - end - def convert_to_text(fpath, type) text = nil return text if !File.exist?(FORMAT_HANDLERS[type].split(' ').first) @@ -363,56 +173,6 @@ def convert_to_text(fpath, type) return text end -def add_or_update_index(databasepath, indexconf, project, repository, identifier, - path, lastrev, action, type) - uri = generate_uri(project, repository, identifier, path) - return unless uri - text = nil - if Redmine::MimeType.is_type?('text', path) || (%(js).include?(type)) - text = repository.cat(path, identifier) - else - fname = path.split('/').last.tr(' ', '_') - bstr = nil - bstr = repository.cat(path, identifier) - File.open( "#{$tempdir}/#{fname}", 'wb+') do | bs | - bs.write(bstr) - end - text = convert_to_text("#{$tempdir}/#{fname}", type) if File.exist?("#{$tempdir}/#{fname}") and !bstr.nil? - File.unlink("#{$tempdir}/#{fname}") - end - log "generated uri: #{uri}" - log('Mime type text') if Redmine::MimeType.is_type?('text', path) - log "Indexing: #{path}" - begin - itext = Tempfile.new('filetoindex.tmp', $tempdir) - itext.write("url=#{uri.to_s}\n") - if action != DELETE - sdate = lastrev.time || Time.at(0).in_time_zone - itext.write("date=#{sdate.to_s}\n") - body = nil - text.force_encoding('UTF-8') - text.each_line do |line| - if body.blank? - itext.write("body=#{line}") - body = 1 - else - itext.write("=#{line}") - end - end - else - log "Path: #{path} should be deleted" - end - itext.close - log "TEXT #{itext.path} generated" - log "Index command: #{$scriptindex} -s #{$user_stem_lang} #{databasepath} #{indexconf.path} #{itext.path}" - system_or_raise("#{$scriptindex} -s english #{databasepath} #{indexconf.path} #{itext.path}") - itext.unlink - log 'New doc added to xapian database' - rescue Exception => e - log "Text not indexed beacause an error #{e.message}", true - end -end - def log(text, error = false) if error $stderr.puts text @@ -429,16 +189,6 @@ def system_or_raise(command) end end -def find_project(prt) - project = Project.active.has_module(:repository).find_by_identifier(prt) - if project - log "Project found: #{project}" - else - log "Project #{prt} not found", true - end - @project = project -end - log "Trying to load Redmine environment <<#{$environment}>>..." begin @@ -453,67 +203,33 @@ include Rails.application.routes.url_helpers log "Redmine environment [RAILS_ENV=#{$env}] correctly loaded ..." -# Indexing files -unless $onlyrepos - unless File.exist?($omindex) - log "#{$omindex} does not exist, exiting...", true - exit 1 - end - $stem_langs.each do | lang | - filespath = File.join($redmine_root, $files) - unless File.directory?(filespath) - log "An error while accessing #{filespath}, exiting...", true - exit 1 - end - databasepath = File.join($dbrootpath, lang) - unless File.directory?(databasepath) - log "#{databasepath} does not exist, creating ..." - begin - FileUtils.mkdir_p databasepath - rescue Exception => e - log e.message, true - exit 1 - end - end - cmd = "#{$omindex} -s #{lang} --db #{databasepath} #{filespath} --url / --depth-limit=0" - cmd << ' -v' if $verbose > 0 - cmd << ' --retry-failed' if $retryfailed - log cmd - system_or_raise (cmd) - end - log 'Redmine files indexed' +# Indexing documents +unless File.exist?($omindex) + log "#{$omindex} does not exist, exiting...", true + exit 1 end - -# Indexing repositories -unless $onlyfiles - unless File.exist?($scriptindex) - log "#{$scriptindex} does not exist, exiting...", true +$stem_langs.each do | lang | + filespath = File.join($redmine_root, $files) + unless File.directory?(filespath) + log "An error while accessing #{filespath}, exiting...", true exit 1 end - databasepath = File.join($dbrootpath.rstrip, 'repodb') + databasepath = File.join($dbrootpath, lang) unless File.directory?(databasepath) - log "Db directory #{databasepath} does not exist, creating..." + log "#{databasepath} does not exist, creating ..." begin - FileUtils.mkdir_p databasepath + FileUtils.mkdir_p databasepath rescue Exception => e log e.message, true exit 1 - end - end - $projects.each do |identifier| - begin - project = Project.active.has_module(:repository).where(:identifier => identifier).preload(:repository).first - raise ActiveRecord::RecordNotFound unless project - log "Indexing repositories for #{project.name}..." - repositories = project.repositories.select { |repository| repository.supports_cat? } - repositories.each do |repository| - delete_log(repository) if ($resetlog) - indexing(databasepath, project, repository) - end - rescue ActiveRecord::RecordNotFound - log "Project identifier #{identifier} not found or repository module not enabled, ignoring..." end end + cmd = "#{$omindex} -s #{lang} --db #{databasepath} #{filespath} --url / --depth-limit=0" + cmd << ' -v' if $verbose > 0 + cmd << ' --retry-failed' if $retryfailed + log cmd + system_or_raise (cmd) end +log 'Redmine DMS documents indexed' exit 0 \ No newline at end of file