redmine_dmsf/extra/xapian_indexer.rb

235 lines
7.5 KiB
Ruby

#!/usr/bin/ruby -W0
# encoding: utf-8
#
# Redmine Xapian is a Redmine plugin that allows searching attachments by their content.
#
# Copyright © 2010 Xabier Elkano
# Copyright © 2011-18 Karel Pičman <karel.picman@kontron.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
################################################################################################
# BEGIN Configuration parameters
# Adjust the following parameters to match your installation. Some of them
# (stemming languages, verbosity, temporary directory) can also be overridden
# from the command line.
################################################################################################
# Redmine installation directory
$redmine_root = '/opt/redmine'
# DMSF document storage directory, relative to $redmine_root
# (the indexed directory is $redmine_root/$files)
$files = 'dmsf'
# Path to the scriptindex binary
$scriptindex = '/usr/bin/scriptindex'
# Path to the omindex binary; the script exits if this file does not exist
$omindex = '/usr/bin/omindex'
# Root directory of the Xapian databases built by omindex; one sub-directory
# per stemming language is created below it
$dbrootpath = '/var/tmp/dmsf-index'
# Verbosity level: 0 is quiet, any value greater than 0 enables verbose output
# (each -v on the command line adds 1)
$verbose = 0
# Stemming languages used for indexing, e.g. [ 'english', 'italian', 'spanish' ].
# Available languages: danish dutch english finnish french german german2 hungarian
# italian kraaij_pohlmann lovins norwegian porter portuguese romanian russian spanish
# swedish turkish
$stem_langs = ['english']
# Temporary directory used while indexing; it can be a tmpfs mount
$tempdir = '/tmp'
# External binaries (with their fixed arguments) used to convert documents to text
$pdftotext = '/usr/bin/pdftotext -enc UTF-8'
$antiword = '/usr/bin/antiword'
$catdoc = '/usr/bin/catdoc'
$xls2csv = '/usr/bin/xls2csv'
$catppt = '/usr/bin/catppt'
$unzip = '/usr/bin/unzip -o'
# The trailing shell redirection discards whatever unrtf writes to stderr
$unrtf = '/usr/bin/unrtf -t text 2>/dev/null'
################################################################################################
# END Configuration parameters
################################################################################################
# Rails environment file that is required to boot Redmine
$environment = File.join($redmine_root, 'config/environment.rb')
# NOTE(review): not read anywhere in the visible part of this script; the
# indexing loop uses a local variable of the same name - confirm before removing
$databasepath = nil
# Rails environment name exported as RAILS_ENV; overridden by -e/--environment
$env = 'production'
# Set by -R/--retry-failed; when set, --retry-failed is passed on to omindex
$retryfailed = nil
# MIME types with a dedicated text extraction handler, mapped to the
# FORMAT_HANDLERS key (file extension) used to pick the converter.
MIME_TYPES = {
  # Portable / legacy office formats
  'application/pdf'               => 'pdf',
  'application/rtf'               => 'rtf',
  'application/msword'            => 'doc',
  'application/vnd.ms-excel'      => 'xls',
  'application/vnd.ms-powerpoint' => 'ppt,pps',
  # Office Open XML
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document'   => 'docx',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'         => 'xlsx',
  'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'pptx',
  'application/vnd.openxmlformats-officedocument.presentationml.slideshow'    => 'ppsx',
  # OpenDocument
  'application/vnd.oasis.opendocument.spreadsheet'  => 'ods',
  'application/vnd.oasis.opendocument.text'         => 'odt',
  'application/vnd.oasis.opendocument.presentation' => 'odp',
  # Source code
  'application/javascript' => 'js'
}.freeze
# Converter command line (binary plus fixed arguments) for every format key
# that MIME_TYPES can yield. The values are read from the configuration
# globals at the moment this table is built.
FORMAT_HANDLERS = {
  # Formats with a dedicated converter
  'pdf'     => $pdftotext,
  'doc'     => $catdoc,
  'xls'     => $xls2csv,
  'ppt,pps' => $catppt,
  # Zip based formats
  'docx' => $unzip,
  'xlsx' => $unzip,
  'pptx' => $unzip,
  'ppsx' => $unzip,
  'ods'  => $unzip,
  'odt'  => $unzip,
  'odp'  => $unzip,
  # Rich text
  'rtf' => $unrtf
}.freeze
require 'fileutils'
require 'optparse'
require 'shellwords'
# Script version printed by -V/--version
VERSION = '0.1'

# Command line interface. Every switch overrides one of the configuration
# globals defined at the top of the script.
optparse = OptionParser.new do |opts|
  opts.banner = 'Usage: xapian_indexer.rb [OPTIONS...]'
  opts.separator('')
  opts.separator('Index Redmine DMS documents')
  opts.separator('')
  opts.separator('')
  opts.separator('Options:')
  opts.on('-s', '--stemming_lang a,b,c', Array,
          'Comma separated list of stemming languages for indexing') { |s| $stem_langs = s }
  # The switch may be repeated; each occurrence raises the verbosity by one
  opts.on('-v', '--verbose', 'verbose') { $verbose += 1 }
  # The Rails environment is called "test", not "testing"
  opts.on('-e', '--environment ENV',
          'Rails ENVIRONMENT (development, test or production), default production') { |e| $env = e }
  opts.on('-t', '--temp-dir PATH', 'Temporary directory for indexing') { |t| $tempdir = t }
  opts.on('-V', '--version', 'show version and exit') do
    puts VERSION
    exit
  end
  opts.on('-h', '--help', 'show help and exit') do
    puts opts
    exit
  end
  opts.on('-R', '--retry-failed', 'retry files which omindex failed to extract text') { $retryfailed = 1 }
  opts.separator('')
  opts.separator('Examples:')
  opts.separator(' xapian_indexer.rb -s english,italian -v')
  opts.separator('')
  opts.summary_width = 25
end

# Consume the recognised switches from ARGV. An unknown switch or a missing
# argument is reported together with the usage text instead of a backtrace.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  $stderr.puts e.message
  $stderr.puts optparse
  exit 1
end

# Exported before the Redmine environment file is required further down
ENV['RAILS_ENV'] = $env
# Result codes
# NOTE(review): none of these constants is referenced in the visible part of
# this script - confirm whether other code still relies on them
STATUS_SUCCESS = 1
STATUS_FAIL    = -1

# Operation codes
ADD_OR_UPDATE = 1
DELETE        = 0

# Error type reserved for indexing failures
class IndexingError < StandardError
end
# Tells whether the text of a file can be extracted for indexing.
#
# entry - file name handed to Redmine::MimeType.of
#
# Returns true when the detected MIME type is listed in MIME_TYPES, otherwise
# whatever Redmine::MimeType.is_type? answers for the 'text' main type.
def supported_mime_type(entry)
  detected = Redmine::MimeType.of(entry)
  return true if MIME_TYPES.key?(detected)

  Redmine::MimeType.is_type?('text', detected)
end
# Extracts the text of a document with the external converter registered for
# its format in FORMAT_HANDLERS.
#
# fpath - path of the document; it is shell-escaped, so spaces and shell
#         metacharacters in file names are safe
# type  - FORMAT_HANDLERS key ('pdf', 'doc', 'docx', 'ods', ...)
#
# Returns the converter output as a String. For zip based formats the raw XML
# of the main content member is returned. Returns nil when the type has no
# handler, the converter binary is not installed or the archive member cannot
# be read.
def convert_to_text(fpath, type)
  handler = FORMAT_HANDLERS[type]
  # Types without a handler (e.g. 'js') must not blow up on nil
  return nil unless handler && File.exist?(handler.split(' ').first.to_s)

  source = Shellwords.escape(fpath)
  member = zipped_text_member(type)
  if member
    read_zipped_member(handler, source, member)
  elsif type == 'pdf'
    # The trailing dash is appended after the input path for this converter
    `#{handler} #{source} -`
  else
    `#{handler} #{source}`
  end
end

# Name of the archive member holding the text of a zip based format, or nil
# when the format is not zip based.
def zipped_text_member(type)
  case type
  when 'xlsx' then 'xl/sharedStrings.xml'
  when 'docx' then 'word/document.xml'
  when 'odt', 'ods', 'odp' then 'content.xml'
  when 'pptx', 'ppsx' then 'docProps/app.xml'
  end
end

# Unpacks an (already shell-escaped) archive into $tempdir/temp and returns the
# content of the given member, or nil when it cannot be read. The scratch
# directory is removed before and after use.
def read_zipped_member(handler, source, member)
  workdir = "#{$tempdir}/temp"
  target = "#{workdir}/#{member}"
  begin
    # Left-overs of an interrupted run must never be mistaken for fresh content
    FileUtils.rm_rf(workdir)
    system("#{handler} -d #{Shellwords.escape(workdir)} #{source}", out: File::NULL)
    File.read(target)
  rescue StandardError => e
    log "Error: #{e} reading #{target}", true
    nil
  ensure
    FileUtils.rm_rf(workdir)
  end
end
# Prints a message.
#
# text  - message to print
# error - when true the message always goes to standard error; otherwise it
#         goes to standard output, and only when $verbose is greater than 0
def log(text, error = false)
  return $stderr.puts(text) if error

  $stdout.puts(text) if $verbose > 0
end
# Runs a shell command and raises a RuntimeError when it does not succeed.
# The command's standard output is shown only in verbose mode; otherwise it is
# redirected to /dev/null.
#
# command - shell command line to execute
def system_or_raise(command)
  succeeded = $verbose > 0 ? system(command) : system(command, :out => '/dev/null')
  raise "\"#{command}\" failed" unless succeeded
end
# Boot Redmine by requiring its environment file; without it nothing can be
# indexed, so a load failure ends the script with exit status 1.
log("Trying to load Redmine environment <<#{$environment}>>...")
begin
  require($environment)
rescue LoadError
  [
    "Redmine #{$environment} cannot be loaded!! Be sure the redmine installation directory is correct!",
    'Edit script and correct path'
  ].each { |line| log(line, true) }
  exit(1)
end
include(Rails.application.routes.url_helpers)
log("Redmine environment [RAILS_ENV=#{$env}] correctly loaded ...")
# Indexing documents: run omindex once per stemming language, each language
# getting its own database below $dbrootpath.
unless File.exist?($omindex)
  log "#{$omindex} does not exist, exiting...", true
  exit 1
end

# The document directory is the same for every language, so it is checked once
filespath = File.join($redmine_root, $files)
unless File.directory?(filespath)
  log "An error while accessing #{filespath}, exiting...", true
  exit 1
end

$stem_langs.each do |lang|
  databasepath = File.join($dbrootpath, lang)
  unless File.directory?(databasepath)
    log "#{databasepath} does not exist, creating ..."
    begin
      FileUtils.mkdir_p databasepath
    rescue StandardError => e
      log e.message, true
      exit 1
    end
  end

  # The language comes from the command line and the paths from the
  # configuration; escape them all so that spaces or shell metacharacters
  # cannot alter the command
  cmd = [$omindex, '-s', lang, '--db', databasepath, filespath].map { |arg| Shellwords.escape(arg) }.join(' ')
  cmd << ' --url / --depth-limit=0'
  cmd << ' -v' if $verbose > 0
  cmd << ' --retry-failed' if $retryfailed
  log cmd

  begin
    system_or_raise(cmd)
  rescue RuntimeError => e
    # Report the failed command instead of dying with a backtrace
    log e.message, true
    exit 1
  end
end

log 'Redmine DMS documents indexed'
exit 0