無法在Lucene + Tika上返回帶有重音的單詞的結果

實施了Lucene和Tika Apache庫軟件包，並設法使其能夠非常好地滿足我的需求。但是我在口頭上有一個單詞的問題，它不能返回帶有口音的單詞的結果。無法在Lucene + Tika上返回帶有重音的單詞的結果

Indexador.java

package br.com.ir; 

import org.apache.log4j.Logger; 
import org.apache.lucene.analysis.br.BrazilianAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.document.TextField; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.index.IndexWriterConfig; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.SimpleFSDirectory; 
import org.apache.lucene.util.Version; 
import org.apache.tika.Tika; 

import javax.swing.*; 
import java.io.File; 
import java.io.FilenameFilter; 
import java.io.IOException; 
import java.text.SimpleDateFormat; 

class Indexador { 
    private static final Logger logger = Logger.getLogger(Indexador.class); 
    private IndexWriter writer; 
    private Tika tika; 
    private int qntArq = 0; 

    public void iniciaIndexacao() { 
     try { 
      File diretorio = new File(ArquivoDeConfiguracao.retornaValorIndice()); 

      apagaIndices(diretorio); 

      Directory d = new SimpleFSDirectory(diretorio); 

      BrazilianAnalyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_4_9); 

      IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, 
        analyzer); 

      writer = new IndexWriter(d, config); 

      long inicio = System.currentTimeMillis(); 
      indexaArquivosDoDiretorio(new File(ArquivoDeConfiguracao.retornaValorFonte())); 
      // Fecha o IndexWriter e comita as mudanças 
      writer.commit(); 
      long fim = System.currentTimeMillis(); 
      JOptionPane.showMessageDialog(
        null, 
        "Quantidade de arquivos indexados: " + qntArq + "\n" 
          + "Tempo para indexar: " 
          + String.valueOf((fim - inicio)/1000) + "s" 
      ); 
     } catch (IOException e) { 
      logger.error(e); 
     } finally { 
      if (writer != null) try { 
       writer.close(); 
      } catch (IOException e) { 
       e.printStackTrace(); 
      } 
     } 
    } 

    private void apagaIndices(File diretorio) { 
     if (diretorio.exists()) { 
      File arquivos[] = diretorio.listFiles(); 
      if (arquivos != null) { 
       for (File arquivo : arquivos) { 
        arquivo.delete(); 
       } 
      } 
     } 
    } 

    void indexaArquivosDoDiretorio(File raiz) { 
     FilenameFilter filtro = new FilenameFilter() { 
      public boolean accept(File arquivo, String nome) { 
       return nome.toLowerCase().endsWith(".pdf") 
         || nome.toLowerCase().endsWith(".odt") 
         || nome.toLowerCase().endsWith(".doc") 
         || nome.toLowerCase().endsWith(".docx") 
         || nome.toLowerCase().endsWith(".ppt") 
         || nome.toLowerCase().endsWith(".pptx") 
         || nome.toLowerCase().endsWith(".xls") 
         || nome.toLowerCase().endsWith(".xlsx") 
         || nome.toLowerCase().endsWith(".txt") 
         || nome.toLowerCase().endsWith(".rtf") 
         || nome.toLowerCase().endsWith(""); 
      } 
     }; 

     for (File arquivo : raiz.listFiles(filtro)) { 
      if (arquivo.isFile()) { 
       try { 
        String textoExtraido = getTika().parseToString(arquivo); 
        indexaArquivo(arquivo, textoExtraido); 
        qntArq++; 
       } catch (Exception e) { 
        logger.error(e); 
       } 
      } else { 
       indexaArquivosDoDiretorio(arquivo); 
      } 
     } 
    } 

    private void indexaArquivo(File arquivo, String textoExtraido) { 
     SimpleDateFormat formatador = new SimpleDateFormat("ddMMyyyy"); 

     String ultimaModificacao = formatador.format(arquivo.lastModified()); 
     Document documento = new Document(); 
     documento.add(new TextField("UltimaModificacao", ultimaModificacao, 
       Field.Store.YES)); 
     documento.add(new TextField("Caminho", arquivo.getAbsolutePath(), 
       Field.Store.YES)); 
     documento.add(new TextField("Texto", textoExtraido, Field.Store.YES)); 

     try { 
      getWriter().addDocument(documento); 
     } catch (IOException e) { 
      logger.error(e); 
     } 
    } 

    Tika getTika() { 
     if (tika == null) { 
      tika = new Tika(); 
     } 
     return tika; 
    } 

    IndexWriter getWriter() { 
     return writer; 
    } 
}

Buscador.java

package br.com.ir; 

import org.apache.log4j.Logger; 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.index.DirectoryReader; 
import org.apache.lucene.index.IndexReader; 
import org.apache.lucene.queryparser.classic.QueryParser; 
import org.apache.lucene.search.IndexSearcher; 
import org.apache.lucene.search.Query; 
import org.apache.lucene.search.ScoreDoc; 
import org.apache.lucene.search.TopDocs; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.SimpleFSDirectory; 
import org.apache.lucene.util.Version; 

import javax.swing.*; 
import java.io.File; 

class Buscador { 
    private final static Logger logger = Logger.getLogger(Buscador.class); 
    public int totalDeOcorrencias; 
    public String quantBusca; 

    @SuppressWarnings({"unchecked", "rawtypes"}) 
    public Buscador(String parametro, DefaultListModel listModel) { 
     try { 
      Directory diretorio = new SimpleFSDirectory(new File(
        ArquivoDeConfiguracao.retornaValorIndice())); 
      // IndexReader: classe abstrata responsável por acessar 
      // o índice; 
      IndexReader leitor = DirectoryReader.open(diretorio); 
      // IndexSearcher: implementa os métodos necessários para 
      // realizar buscas em um índice; 
      IndexSearcher buscador = new IndexSearcher(leitor); 
      Analyzer analisador = new StandardAnalyzer(Version.LUCENE_4_9); 
      // QueryParser/Query: representa a consulta do usuário. Outros 
      // exemplos de query podem ser vistos no Javadoc; 
      QueryParser parser = new QueryParser(Version.LUCENE_4_9, "Texto", 
        analisador); 
      Query consulta = parser.parse(parametro); 
      long inicio = System.currentTimeMillis(); 
      // Realiza a busca e armazena o resultado em um TopDocs; 
      TopDocs resultado = buscador.search(consulta, 100); 
      long fim = System.currentTimeMillis(); 
      totalDeOcorrencias = resultado.totalHits; 
      quantBusca = String.valueOf((fim - inicio)/1000); 

      // ScoreDoc: representa cada um dos documentos retornados na busca. 
      for (ScoreDoc sd : resultado.scoreDocs) { 
       Document documento = buscador.doc(sd.doc); 
       // Preenche o modelo 
       listModel.addElement(documento.get("Caminho")); 
      } 
      leitor.close(); 
     } catch (Exception e) { 
      logger.error(e); 
     } 
    } 
}

Indexador.java是，文件和索引它在文件中的目錄裏面搜索類。然後用Buscador.java執行搜索，但是當我嘗試搜索諸如「plástico」或「coração」（假設有這些文件的文件）時，它不會返回任何內容。

來源

2014-07-18 Strokes

我沒有看到你是如何在提供的代碼搜索。除此之外，還有一個沒有得到預期結果的查詢示例。 – femtoRgon

問題已更新 – Strokes

您在索引時使用BrazilianAnalyzer，我假設它有一些特定的重音詞處理（以及詞幹分析），並且在查詢時使用StandardAnalyzer，但沒有這些語言特定的增強功能。

要解決這個問題，請使用相同的分析儀。可能BrazilianAnalyzer是你想要去的方式，因此，在Buscador.java，更改爲：

Analyzer analisador = new BrazilianAnalyzer(Version.LUCENE_4_9); 
QueryParser parser = new QueryParser(Version.LUCENE_4_9, "Texto", analisador);

通常你應該使用相同的分析和索引和搜索時間（除非你有一個很好的理由不這樣做）。

來源

2014-07-18 17:52:10 femtoRgon

你說得對，我不知道我怎麼沒看到它！ – Strokes

無法在Lucene + Tika上返回帶有重音的單詞的結果

回答

相關問題