redmine_dmsf/extra/xapian_indexer.rb
2015-09-04 13:25:59 +02:00

545 lines
20 KiB
Ruby

#!/usr/bin/ruby -W0
# encoding: utf-8
#
# Redmine Xapian is a Redmine plugin to allow attachments searches by content.
#
# Copyright (C) 2010 Xabier Elkano
# Copyright (C) 2015 Karel Pičman <karel.picman@kontron.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
################################################################################################
# BEGIN Configuration parameters
# Adjust the values below to the local installation; most of them can also be
# overridden from the command line.
################################################################################################

# Root directory of the Redmine installation
$redmine_root = '/opt/redmine'
# Attachments directory, joined onto $redmine_root
$files = 'files'
# Path of the scriptindex binary
$scriptindex = '/usr/bin/scriptindex'
# Path of the omindex binary
$omindex = '/usr/bin/omindex'
# Directory containing the Xapian databases for omindex (attachments indexing)
$dbrootpath = '/var/tmp/omindex'
# Verbosity: 0 prints only unconditional messages, greater values print progress details
$verbose = 0
# Stemming languages used to index attachments, e.g. %w(english italian spanish).
# The repository database is always indexed in english.
# Available languages are danish dutch english finnish french german german2 hungarian italian
# kraaij_pohlmann lovins norwegian porter portuguese romanian russian spanish swedish turkish
$stem_langs = %w(english)
# Identifiers of the projects to index, e.g. %w(prj_id1 prj_id2)
projects = %w(prj_id1 prj_id2)
# Temporary directory used while indexing; it can be a tmpfs
$tempdir = '/tmp'

# Command lines of the external text converters
$pdftotext = '/usr/bin/pdftotext -enc UTF-8'
$antiword = '/usr/bin/antiword'
$catdoc = '/usr/bin/catdoc'
$xls2csv = '/usr/bin/xls2csv'
$catppt = '/usr/bin/catppt'
$unzip = '/usr/bin/unzip -o'
$unrtf = '/usr/bin/unrtf -t text 2>/dev/null'
################################################################################################
# END Configuration parameters
################################################################################################

# Rails boot file of the configured Redmine installation
$environment = File.join($redmine_root, 'config', 'environment.rb')
# Rails environment the script runs in unless -e/--environment says otherwise
$env = 'production'
# Globals that start out unset (nil)
$project = $databasepath = $repositories = $onlyfiles = $onlyrepos = $resetlog = nil

# MIME type => key of the converter in FORMAT_HANDLERS
MIME_TYPES = {
  'application/pdf' => 'pdf',
  'application/rtf' => 'rtf',
  'application/msword' => 'doc',
  'application/vnd.ms-excel' => 'xls',
  'application/vnd.ms-powerpoint' => 'ppt,pps',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
  'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'pptx',
  'application/vnd.openxmlformats-officedocument.presentationml.slideshow' => 'ppsx',
  'application/vnd.oasis.opendocument.spreadsheet' => 'ods',
  'application/vnd.oasis.opendocument.text' => 'odt',
  'application/vnd.oasis.opendocument.presentation' => 'odp'
}.freeze

# Converter key => command line of the tool that handles that format
FORMAT_HANDLERS = {
  'pdf'     => $pdftotext,
  'doc'     => $catdoc,
  'xls'     => $xls2csv,
  'ppt,pps' => $catppt,
  'docx'    => $unzip,
  'xlsx'    => $unzip,
  'pptx'    => $unzip,
  'ppsx'    => $unzip,
  'ods'     => $unzip,
  'odt'     => $unzip,
  'odp'     => $unzip,
  'rtf'     => $unrtf
}.freeze
require 'optparse'
require 'shellwords'
# Version string printed by the -V/--version switch.
VERSION = '0.1'
# SCM names; NOTE(review): not referenced anywhere else in the visible script.
SUPPORTED_SCM = %w(Subversion Darcs Mercurial Bazaar Git Filesystem)

# Command-line interface. Each switch overrides one of the configuration values
# assigned earlier in the script.
optparse = OptionParser.new do |parser|
  parser.banner = 'Usage: xapian_indexer.rb [OPTIONS...]'
  parser.separator('')
  parser.separator('Index redmine files and repositories')
  parser.separator('')
  parser.separator('')
  parser.separator('Options:')
  parser.on('-p', '--projects a,b,c', Array, 'Comma separated list of projects to index') do |identifiers|
    projects = identifiers
  end
  parser.on('-s', '--stemming_lang a,b,c', Array,
            'Comma separated list of stemming languages for indexing') do |languages|
    $stem_langs = languages
  end
  parser.on('-v', '--verbose', 'verbose') do
    $verbose += 1
  end
  parser.on('-f', '--files', 'Only index Redmine attachments') do
    $onlyfiles = 1
  end
  parser.on('-r', '--repositories', 'Only index Redmine repositories') do
    $onlyrepos = 1
  end
  parser.on('-e', '--environment ENV',
            'Rails ENVIRONMENT (development, testing or production), default production') do |name|
    $env = name
  end
  parser.on('-t', '--temp-dir PATH', 'Temporary directory for indexing') do |dir|
    $tempdir = dir
  end
  parser.on('-x', '--resetlog', 'Reset index log') do
    $resetlog = 1
  end
  parser.on('-V', '--version', 'show version and exit') do
    puts VERSION
    exit
  end
  parser.on('-h', '--help', 'show help and exit') do
    puts parser
    exit
  end
  parser.separator('')
  parser.separator('Examples:')
  parser.separator(' xapian_indexer.rb -f -s english,italian -v')
  parser.separator(' xapian_indexer.rb -p project_id -x -t /tmpfs -v')
  parser.separator('')
  parser.summary_width = 25
end
optparse.parse!

# The Rails environment has to be exported before Redmine is loaded.
ENV['RAILS_ENV'] = $env

# Values stored in the status column of an indexing log entry
STATUS_SUCCESS = 1
STATUS_FAIL = -1
# Actions passed to add_or_update_index
ADD_OR_UPDATE = 1
DELETE = 0
# Error class that indexing() rescues in order to record a failed indexing run.
class IndexingError < StandardError
end
# Returns the name used for a repository in log output: its identifier, or
# 'main' when the identifier is blank.
def repo_name(repository)
  return 'main' if repository.identifier.blank?
  repository.identifier
end
# Indexes one repository into the Xapian database at +databasepath+.
#
# The changesets are fetched first. When no successful Indexinglog entry exists
# for the repository everything is indexed (indexing_all); otherwise only the
# changesets after the last successfully indexed one (indexing_diff). The
# outcome is recorded with add_log.
#
# databasepath - directory of the Xapian database handed on to the index helpers
# project      - object responding to #name (presumably a Redmine Project)
# repository   - repository to index
#
# Returns nil immediately when the repository has no changesets.
def indexing(databasepath, project, repository)
  Rails.logger.info("Fetch changesets: #{project.name} - #{repo_name(repository)}")
  log("- Fetch changesets: #{project.name} - #{repo_name(repository)}", :level => 1)
  repository.fetch_changesets
  repository.reload.changesets.reload
  latest_changeset = repository.changesets.first
  return unless latest_changeset
  Rails.logger.debug("Latest revision: #{project.name} - #{repo_name(repository)} - #{latest_changeset.revision}")
  latest_indexed = Indexinglog.where(:repository_id => repository.id, :status => STATUS_SUCCESS).last
  Rails.logger.debug "Debug latest_indexed #{latest_indexed.inspect}"
  indexconf = nil
  begin
    # scriptindex configuration shared by every document of this run
    indexconf = Tempfile.new('index.conf', $tempdir)
    indexconf.write "url : field boolean=Q unique=Q\n"
    indexconf.write "body : index truncate=400 field=sample\n"
    indexconf.write "date: field=date\n"
    indexconf.close
    if latest_indexed
      Rails.logger.debug "Repository #{repo_name(repository)} indexed, indexing diff"
      log("\t>repo #{repo_name(repository)} already indexed, indexing only diff", :level => 1)
      indexing_diff(databasepath, indexconf, project, repository,
                    latest_indexed.changeset, latest_changeset)
    else
      Rails.logger.debug "Repository #{repo_name(repository)} not indexed, indexing all"
      log("\t>repo #{repo_name(repository)} not indexed, indexing all", :level => 1)
      indexing_all(databasepath, indexconf, project, repository)
    end
  rescue IndexingError => e
    # NOTE(review): nothing in the visible script raises IndexingError - confirm
    # whether system_or_raise was meant to.
    add_log(repository, latest_changeset, STATUS_FAIL, e.message)
  else
    add_log(repository, latest_changeset, STATUS_SUCCESS)
    Rails.logger.info("Successfully indexed: #{project.name} - #{repo_name(repository)} - #{latest_changeset.revision}")
  ensure
    # Remove the configuration file on every exit path, not only on success.
    indexconf.close! if indexconf
  end
end
# Tells whether the file named +entry+ can be indexed: its MIME type is either
# listed in MIME_TYPES or belongs to the text/* family.
#
# Returns false when no MIME type can be determined.
def supported_mime_type(entry)
  mime = Redmine::MimeType.of(entry)
  return false if mime.nil?
  MIME_TYPES.include?(mime) || mime.split('/').first.eql?('text')
end
# Records the outcome of an indexing run for +repository+.
#
# The most recent Indexinglog entry of the repository is updated; when none
# exists a new one is created.
#
# repository - repository that was indexed
# changeset  - changeset the index is now based on
# status     - STATUS_SUCCESS or STATUS_FAIL
# message    - optional text; an existing message is kept when this is nil
def add_log(repository, changeset, status, message = nil)
  record = Indexinglog.where(:repository_id => repository.id).last
  created = !record
  if created
    record = Indexinglog.new
    record.repository = repository
    record.changeset = changeset
  else
    record.changeset_id = changeset.id
  end
  record.status = status
  record.message = message if message
  record.save!
  notice = if created
             "New log for repo #{repo_name(repository)} saved!"
           else
             "Log for repo #{repo_name(repository)} updated!"
           end
  Rails.logger.info notice
  log("\t>#{notice}", :level => 1)
end
# Updates the most recent Indexinglog entry of +repository+; does nothing when
# the repository has no entry yet.
#
# changeset - changeset whose id is stored on the entry
# status    - new status; the stored one is kept when nil/false
# message   - new message; the stored one is kept when nil/false
def update_log(repository, changeset, status, message = nil)
  record = Indexinglog.where(:repository_id => repository.id).last
  return unless record
  record.changeset_id = changeset.id
  record.status = status if status
  record.message = message if message
  record.save!
  notice = "Log for repo #{repo_name(repository)} updated!"
  Rails.logger.info(notice)
  log("\t>#{notice}", :level => 1)
end
# Removes every Indexinglog entry of +repository+, so that the next run indexes
# the repository from scratch.
def delete_log(repository)
  # Scope with where() first: passing conditions straight to delete_all is
  # deprecated in Rails 4 and no longer accepted by later versions.
  Indexinglog.where(:repository_id => repository.id).delete_all
  Rails.logger.info("Log for repo #{repo_name(repository)} removed!")
  log("\t>Log for repo #{repo_name(repository)} removed!", :level => 1)
end
# Recursively indexes the repository tree below +entries+.
#
# Directories are descended into; files whose MIME type is supported are handed
# to add_or_update_index with the ADD_OR_UPDATE action. Everything else is
# skipped.
#
# identifier - branch, tag or revision the entries were listed for
# entries    - directory listing to process; nil or empty ends the recursion
def walk(databasepath, indexconf, project, repository, identifier, entries)
  return if entries.nil? || entries.size < 1
  Rails.logger.debug "Walk entries size: #{entries.size}"
  entries.each do |entry|
    Rails.logger.debug "Walking into: #{entry.lastrev.time}"
    if entry.is_dir?
      children = repository.entries(entry.path, identifier)
      walk(databasepath, indexconf, project, repository, identifier, children)
      next
    end
    next unless entry.is_file?
    next unless supported_mime_type(entry.path)
    add_or_update_index(databasepath, indexconf, project, repository, identifier,
                        entry.path, entry.lastrev, ADD_OR_UPDATE,
                        MIME_TYPES[Redmine::MimeType.of(entry.path)])
  end
end
# Indexes the complete content of +repository+.
#
# Every branch is walked from its root; a repository that reports no branches
# (nil) is walked once without an identifier. Tags, when reported, are walked
# afterwards in the same way.
def indexing_all(databasepath, indexconf, project, repository)
  Rails.logger.info "Indexing all: #{repo_name(repository)}"
  # Logs the reference about to be walked, then walks its root listing.
  walk_root = lambda do |kind, shown, ref|
    Rails.logger.debug "Walking in #{kind}: #{repo_name(repository)} - #{shown}"
    walk(databasepath, indexconf, project, repository, ref, repository.entries(nil, ref))
  end
  if repository.branches
    repository.branches.each { |branch| walk_root.call('branch', branch, branch) }
  else
    walk_root.call('branch', '[NOBRANCH]', nil)
  end
  repository.tags.each { |tag| walk_root.call('tag', tag, tag) } if repository.tags
end
# Indexes the files touched by +changesets+.
#
# The changesets are replayed in ascending id order to find the final action
# for each changed path (DELETE when the last change was a 'D', ADD_OR_UPDATE
# otherwise). Each path is then looked up at +identifier+ and passed to
# add_or_update_index.
#
# identifier - branch, tag or revision the changesets belong to
# changesets - changesets to process; nil or empty means nothing to do
#
# A path that is not a deletion is skipped when it cannot be found (logged as
# an error at verbosity level 1) or when it is not a file.
def walkin(databasepath, indexconf, project, repository, identifier, changesets)
  Rails.logger.debug "Walking into #{changesets.inspect}"
  return if !changesets || changesets.size <= 0
  # SCM actions
  # * A - Add
  # * M - Modified
  # * R - Replaced
  # * D - Deleted
  actions = {}
  # Work on a sorted copy so the caller's array is left untouched.
  changesets.sort_by(&:id).each do |changeset|
    Rails.logger.debug "Changeset changes for #{changeset.id} #{changeset.filechanges.inspect}"
    next unless changeset.filechanges
    changeset.filechanges.each do |change|
      actions[change.path] = change.action == 'D' ? DELETE : ADD_OR_UPDATE
    end
  end
  actions.each do |path, action|
    entry = repository.entry(path, identifier)
    if action != DELETE
      if entry.nil?
        # This message used to sit inside a branch that excluded exactly this
        # case, so it was never printed.
        log("Error indexing path: #{path.inspect}, action: #{action.inspect}, identifier: #{identifier.inspect}",
            :level => 1)
        next
      end
      next unless entry.is_file?
    end
    Rails.logger.debug "Entry to index #{entry.inspect}"
    lastrev = entry.lastrev unless entry.nil?
    next unless supported_mime_type(path) || action == DELETE
    add_or_update_index(databasepath, indexconf, project, repository,
                        identifier, path, lastrev, action, MIME_TYPES[Redmine::MimeType.of(path)])
  end
end
# Indexes only the changesets committed after +diff_from+ up to and including
# +diff_to+.
#
# Nothing is done when diff_from is not older than diff_to (compared by id).
# Otherwise the latest changesets of every branch (or of the repository itself
# when it reports no branches) and of every tag are fetched, narrowed down to
# the id range and handed to walkin.
#
# diff_from - changeset of the last successful indexing run
# diff_to   - latest changeset of the repository
def indexing_diff(databasepath, indexconf, project, repository, diff_from, diff_to)
  if diff_from.id >= diff_to.id
    Rails.logger.info "Already indexed: #{repo_name(repository)} (from: #{diff_from.id} to #{diff_to.id})"
    log("\t>Already indexed: #{repo_name(repository)} (from #{diff_from.id} to #{diff_to.id})", :level => 1)
    return
  end
  Rails.logger.info "Indexing diff: #{repo_name(repository)} (from: #{diff_from.id} to #{diff_to.id})"
  Rails.logger.info "Indexing all: #{repo_name(repository)}"
  # Fetches the recent changesets of +ref+, keeps those inside the id range and walks them.
  walk_pending = lambda do |ref|
    recent = repository.latest_changesets('', ref, diff_to.id - diff_from.id)
    pending = recent.select do |changeset|
      changeset.id > diff_from.id && changeset.id <= diff_to.id
    end
    walkin(databasepath, indexconf, project, repository, ref, pending)
  end
  if repository.branches
    repository.branches.each do |branch|
      Rails.logger.debug "Walking in branch: #{repo_name(repository)} - #{branch}"
      walk_pending.call(branch)
    end
  else
    Rails.logger.debug "Walking in branch: #{repo_name(repository)} - [NOBRANCH]"
    walk_pending.call(nil)
  end
  if repository.tags
    repository.tags.each do |tag|
      Rails.logger.debug "Walking in tag: #{repo_name(repository)} - #{tag}"
      walk_pending.call(tag)
    end
  end
end
# Builds the path-only URL of the repository "entry" page for +path+ at
# revision +identifier+; it is stored as the url field of the indexed document.
def generate_uri(project, repository, identifier, path)
  route = {
    :controller => 'repositories',
    :action => 'entry',
    :id => project.identifier,
    :repository_id => repository.identifier,
    :rev => identifier,
    :path => repository.relative_path(path),
    :only_path => true
  }
  url_for(route)
end
# Writes +str+ to standard output without a trailing newline and flushes the
# stream so the text shows up immediately.
def print_and_flush(str)
  $stdout.print(str)
  $stdout.flush
end
# Extracts the text of the file at +fpath+ with the external tool configured
# for +type+.
#
# fpath - path of the file to convert
# type  - key into FORMAT_HANDLERS ('pdf', 'doc', 'docx', ...)
#
# Zip-based office formats are unpacked into "#{$tempdir}/temp" and the raw XML
# of their main member is returned; for every other format the standard output
# of the converter is returned.
#
# Returns nil when no handler is configured for the type, when the handler
# binary is missing, or when the archive member cannot be read.
def convert_to_text(fpath, type)
  handler = FORMAT_HANDLERS[type]
  return nil if handler.nil? || !File.exist?(handler.split(' ').first)
  # The path ends up on a shell command line, so it has to be escaped.
  source = Shellwords.escape(fpath)
  # Archive member that carries the text of each zip-based format
  archive_members = {
    'xlsx' => 'xl/sharedStrings.xml',
    'docx' => 'word/document.xml',
    'odt'  => 'content.xml',
    'ods'  => 'content.xml',
    'odp'  => 'content.xml',
    'pptx' => 'docProps/app.xml',
    'ppsx' => 'docProps/app.xml'
  }
  member = archive_members[type]
  if member
    extract_dir = "#{$tempdir}/temp"
    fout = "#{extract_dir}/#{member}"
    begin
      system("#{$unzip} -d #{Shellwords.escape(extract_dir)} #{source} > /dev/null")
      File.read(fout)
    rescue StandardError => e
      log("\tError: #{e.to_s} reading #{fout}", :level => 1)
      nil
    ensure
      # Always clean up so a later archive never picks up stale members.
      FileUtils.rm_rf(extract_dir)
    end
  elsif type == 'pdf'
    # The trailing "-" makes pdftotext write to standard output.
    `#{handler} #{source} -`
  else
    `#{handler} #{source}`
  end
end
# Adds, replaces or blanks the Xapian document of one repository file.
#
# A small input file for scriptindex is written (url, plus date and body unless
# the file was deleted) and scriptindex is run on it with the english stemmer.
# Text files are indexed as they are; other files are first converted with
# convert_to_text.
#
# databasepath - Xapian database directory passed to scriptindex
# indexconf    - object whose #path names the scriptindex configuration file
# identifier   - branch, tag or revision the file is read from
# path         - path of the file inside the repository
# lastrev      - last revision of the file (may be nil); its time is the document date
# action       - ADD_OR_UPDATE or DELETE
# type         - FORMAT_HANDLERS key for non-text files
#
# Errors while writing or running scriptindex are logged and swallowed, so one
# bad file does not abort the whole run. Returns nil.
def add_or_update_index(databasepath, indexconf, project, repository, identifier,
                        path, lastrev, action, type)
  uri = generate_uri(project, repository, identifier, path)
  return unless uri
  is_text = Redmine::MimeType.is_type?('text', path)
  text = nil
  # A deletion only needs the url, so the content is not fetched at all.
  if action != DELETE
    if is_text
      text = repository.cat(path, identifier)
    else
      fname = path.split('/').last.tr(' ', '_')
      tmp_path = "#{$tempdir}/#{fname}"
      bstr = repository.cat(path, identifier)
      unless bstr.nil?
        begin
          File.open(tmp_path, 'wb+') { |bs| bs.write(bstr) }
          text = convert_to_text(tmp_path, type)
        ensure
          File.unlink(tmp_path) if File.exist?(tmp_path)
        end
      end
    end
  end
  log "generated uri: #{uri}", :level => 1
  log('Mime type text', :level => 1) if is_text
  log("\t>Indexing: #{path}", :level => 1)
  if action != DELETE && text.nil?
    log "Text not indexed because no text could be extracted from #{path}"
    return
  end
  itext = nil
  begin
    itext = Tempfile.new('filetoindex.tmp', $tempdir)
    itext.write("url=#{uri.to_s}\n")
    if action != DELETE
      sdate = (lastrev && lastrev.time) || Time.at(0).in_time_zone
      itext.write("date=#{sdate.to_s}\n")
      text.force_encoding('UTF-8')
      # scriptindex format: the field name on the first line, "=" on continuation lines
      first_line = true
      text.each_line do |line|
        itext.write(first_line ? "body=#{line}" : "=#{line}")
        first_line = false
      end
    else
      log "Path: #{path} should be deleted", :level => 1
    end
    itext.close
    log "TEXT #{itext.path} generated", :level => 1
    command = "#{$scriptindex} -s english #{databasepath} #{indexconf.path} #{itext.path}"
    log "Index command: #{command}", :level => 1
    system_or_raise(command)
    log 'New doc added to xapian database'
  rescue StandardError => e
    log "Text not indexed beacause an error #{e.message}"
  ensure
    # Close and delete the scriptindex input file on every exit path.
    itext.close! if itext
  end
end
# Prints a timestamped message on standard output.
#
# text    - message to print
# options - :level => minimum $verbose value needed for the message to appear
#                     (default 0, i.e. always printed)
#           :exit  => when truthy the script terminates with status 1 afterwards
def log(text, options={})
  threshold = options[:level] || 0
  puts "#{Time.now.asctime}: #{text}" if threshold <= $verbose
  exit 1 if options[:exit]
end
# Runs +command+ through Kernel#system and raises a RuntimeError when it does
# not succeed. The command's standard output is discarded unless $verbose is
# greater than 0.
def system_or_raise(command)
  succeeded = if $verbose > 0
                system(command)
              else
                system(command, :out => '/dev/null')
              end
  raise "\"#{command}\" failed" unless succeeded
end
# Looks up the active project with identifier +prt+ that has the repository
# module enabled.
#
# The result is stored in @project and returned. When no such project exists an
# error is logged (verbosity level 1) and both are nil.
def find_project(prt)
  project = nil
  begin
    project = Project.active.has_module(:repository).find_by_identifier(prt)
    Rails.logger.debug "Project found: #{project}"
    raise ActiveRecord::RecordNotFound unless project
  rescue ActiveRecord::RecordNotFound
    log("- ERROR project #{prt} not found", :level => 1)
    project = nil
  end
  # Assign on every path; this used to happen only in the rescue clause, so a
  # project that was found was neither stored nor returned.
  @project = project
end
# Creates the directory +path+ (one level only) and then waits one second.
#
# Returns 0 on success and 1 when the directory could not be created. Both
# values are truthy, so callers have to compare the result with 0.
def create_dir(path)
  Dir.mkdir(path)
  sleep 1
  0
rescue SystemCallError
  1
end
# ---- Main script -----------------------------------------------------------
# Load the Redmine (Rails) environment; the models and route helpers used below
# come from it.
log("- Trying to load Redmine environment <<#{$environment}>>...", :level => 1)
begin
  require $environment
rescue LoadError
  puts "\n\tRedmine #{$environment} cannot be loaded!! Be sure the redmine installation directory is correct!\n"
  puts "\tEdit script and correct path\n\n"
  exit 1
end
# Provides url_for, which generate_uri relies on.
include Rails.application.routes.url_helpers
log("- Redmine environment [RAILS_ENV=#{$env}] correctly loaded ...", :level => 1)
# NOTE(review): $test is never assigned anywhere in the visible script.
if $test
  log('- Running in test mode ...')
end

# Indexing files: one omindex database per stemming language
unless $onlyrepos
  unless File.exist?($omindex)
    log("- ERROR! #{$omindex} does not exist, exiting...")
    exit 1
  end
  $stem_langs.each do |lang|
    filespath = File.join($redmine_root, $files)
    unless File.directory?(filespath)
      log("- ERROR accessing #{filespath}, exiting...")
      exit 1
    end
    dbpath = File.join($dbrootpath, lang)
    unless File.directory?(dbpath)
      log("- #{dbpath} does not exist, creating ...")
      # create_dir returns 0 on success and 1 on failure. Both are truthy in
      # Ruby, so the code must be compared explicitly; the former
      # "if not create_dir(...)" could never detect a failure.
      if create_dir(dbpath) != 0
        log("- ERROR! #{dbpath} can not be created!, exiting...")
        exit 1
      end
    end
    log("- Indexing files under #{filespath} with omindex stemming in #{lang} ...", :level => 1)
    system_or_raise("#{$omindex} -s #{lang} --db #{dbpath} #{filespath} --url / > /dev/null")
  end
  log('- Redmine files indexed ...', :level => 1)
end

# Indexing repositories: a single scriptindex database shared by all projects
unless $onlyfiles
  unless File.exist?($scriptindex)
    log("- ERROR! #{$scriptindex} does not exist, exiting...")
    exit 1
  end
  databasepath = File.join($dbrootpath.rstrip, 'repodb')
  log(databasepath)
  unless File.directory?(databasepath)
    log("- Db directory #{databasepath} does not exist, creating...")
    begin
      Dir.mkdir(databasepath)
      sleep 1
    rescue SystemCallError
      log("- ERROR! #{databasepath} can not be created!, exiting ...")
      exit 1
    end
  end
  projects.each do |identifier|
    begin
      project = Project.active.has_module(:repository).where(:identifier => identifier).preload(:repository).first
      raise ActiveRecord::RecordNotFound unless project
      log("- Indexing repositories for #{project.name}...", :level => 1)
      # Only repositories whose SCM can return file contents are indexable.
      repositories = project.repositories.select { |repository| repository.supports_cat? }
      repositories.each do |repository|
        delete_log(repository) if $resetlog
        indexing(databasepath, project, repository)
      end
    rescue ActiveRecord::RecordNotFound
      # An unknown project is skipped; the remaining projects are still indexed.
      log("- WARNING project identifier #{identifier} not found or repository module not enabled, ignoring...", :level => 1)
      Rails.logger.error "Project identifier #{identifier} not found"
    end
  end
end
exit 0