2013-01-24 45 views
0

我正在嘗試弄清楚如何讓數據導入處理程序(DataImportHandler)的 splitBy 屬性生效。我期望它能把輸入的數據庫列拆分成多值字段。下面是一個可以重現該問題的測試用例(標題:將數據庫列拆分爲多值Solr字段):

import java.io.File; 
import java.io.IOException; 
import java.sql.SQLException; 

import static org.junit.Assert.*; 

import javax.sql.DataSource; 

import org.apache.commons.dbutils.QueryRunner; 
import org.apache.commons.io.FileUtils; 
import org.apache.solr.client.solrj.SolrQuery; 
import org.apache.solr.client.solrj.SolrServer; 
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; 
import org.apache.solr.client.solrj.response.QueryResponse; 
import org.apache.solr.common.SolrDocument; 
import org.apache.solr.core.CoreContainer; 
import org.hsqldb.jdbc.JDBCDataSource; 
import org.junit.After; 
import org.junit.Before; 
import org.junit.Test; 

/**
 * Self-contained reproduction for the DataImportHandler {@code splitBy} question: a
 * comma-separated database column is expected to be indexed as a multi-valued Solr field.
 *
 * <p>Each test run populates an in-memory HSQLDB table with one row, writes a minimal Solr
 * home (schema, solrconfig and DIH config) into a fresh temp directory, starts an embedded
 * Solr core over it, runs a full import and checks the imported document.
 */
public class TestSplitBy {

    /** Embedded Solr server; created in {@link #startSolr()}, shut down in {@link #tearDown()}. */
    SolrServer server;

    /** Per-instance Solr home under the system temp directory; removed again in tearDown. */
    File configPath = new File(FileUtils.getTempDirectory(), Long.toString(System.nanoTime()));

    /** Minimal solrconfig.xml: a default search handler plus the /dataimport handler. */
    String solrconfig_xml = "<config><luceneMatchVersion>LUCENE_41</luceneMatchVersion><requestHandler name=\"search\" class=\"solr.SearchHandler\" default=\"true\"><lst name=\"defaults\"><str name=\"fl\">*</str><str name=\"df\">id</str></lst></requestHandler><requestHandler name=\"/dataimport\" class=\"org.apache.solr.handler.dataimport.DataImportHandler\"><lst name=\"defaults\"><str name=\"config\">data-config.xml</str></lst></requestHandler></config>";

    /**
     * DIH configuration under test: the RegexTransformer is asked to split column "type" on
     * commas.
     *
     * <p>NOTE(review): intentionally left exactly as posted so the test keeps exercising the
     * reported mapping. HSQLDB reportedly exposes unquoted column names in upper case (TYPE) --
     * confirm whether this field needs a {@code sourceColName} attribute.
     */
    String data_config_xml = "<dataConfig>" +
        "<dataSource url=\"jdbc:hsqldb:mem:testdb\" user=\"SA\" driver=\"org.hsqldb.jdbc.JDBCDriver\" />" +
        "<document>" +
        "<entity name=\"item\" transformer=\"RegexTransformer\" query=\"SELECT * FROM test\">" +
        "<field column=\"type\" name=\"type\" splitBy=\",\" />" +
        "</entity>" +
        "</document>" +
        "</dataConfig>";

    /** Minimal schema: a single-valued string "id" (unique key) and a multi-valued string "type". */
    String schema_xml = "<schema version=\"1.3\" name=\"test\">" +
        "<types>" +
        "<fieldType name=\"string\" class=\"solr.StrField\" sortMissingLast=\"true\" omitNorms=\"true\" />" +
        "</types>" +
        "<fields>" +
        "<field stored=\"true\" name=\"id\" type=\"string\" />" +
        "<field stored=\"true\" name=\"type\" type=\"string\" multiValued=\"true\"/>" +
        "</fields>" +
        "<uniqueKey>id</uniqueKey>" +
        "</schema>";

    /**
     * Builds the data source used to populate the test database.
     *
     * <p>NOTE(review): the DIH config above uses the full {@code jdbc:hsqldb:mem:testdb} URL
     * while this uses the short form -- confirm both resolve to the same in-memory database.
     *
     * @return an HSQLDB data source for user "SA"
     */
    DataSource getDataSource() {
        JDBCDataSource ds = new JDBCDataSource();
        ds.setUser("SA");
        ds.setUrl("mem:testdb");
        return ds;
    }

    /**
     * (Re)creates table {@code test} and inserts the single row the test imports.
     *
     * @param ds data source to run the statements against
     * @throws IllegalStateException if any statement fails; a broken fixture must abort the
     *         test instead of being printed and ignored
     */
    void populateDb(DataSource ds) {
        QueryRunner runner = new QueryRunner(ds);
        try {
            runner.update("DROP TABLE test IF EXISTS");
            runner.update("CREATE TABLE test(id INTEGER, type VARCHAR(256));");
            runner.update("INSERT INTO test VALUES 1, 'foo,bar,baz'");
        } catch (SQLException e) {
            throw new IllegalStateException("Failed to populate the test database", e);
        }
    }

    /**
     * Writes the three config files into {@code <configPath>/collection1/conf}.
     *
     * @throws IOException if the conf directory cannot be created or a file cannot be written
     */
    void writeSolrConfig() throws IOException {
        File corePath = new File(configPath, "collection1");
        File confPath = new File(corePath, "conf");
        // configPath itself does not exist yet, so the whole chain has to be created at once.
        if (!confPath.mkdirs() && !confPath.isDirectory()) {
            throw new IOException("Could not create Solr conf directory " + confPath);
        }
        FileUtils.write(new File(confPath, "data-config.xml"), data_config_xml);
        FileUtils.write(new File(confPath, "schema.xml"), schema_xml);
        FileUtils.write(new File(confPath, "solrconfig.xml"), solrconfig_xml);
    }

    /** Points solr.solr.home at the generated directory and starts the embedded core. */
    void startSolr() {
        System.setProperty("solr.solr.home", configPath.getAbsolutePath());
        CoreContainer.Initializer initializer = new CoreContainer.Initializer();
        CoreContainer coreContainer = initializer.initialize();
        server = new EmbeddedSolrServer(coreContainer, "collection1");
    }

    /** Prepares the database, the Solr home and the embedded server before each test. */
    @Before
    public void setup() throws IOException {
        populateDb(getDataSource());
        writeSolrConfig();
        startSolr();
    }

    /** Stops the embedded server (if it was started) and deletes the generated Solr home. */
    @After
    public void tearDown() {
        // setup() may have failed before the server was created.
        if (server != null) {
            server.shutdown();
        }
        FileUtils.deleteQuietly(configPath);
    }

    /**
     * Runs a full import and expects the single imported document to carry three values in
     * its "type" field.
     */
    @Test
    public void testSplitBy() throws Exception {
        SolrQuery importRequest = new SolrQuery();
        importRequest.set("qt", "/dataimport");
        importRequest.setParam("command", "full-import");
        // Run the import inside this request rather than sleeping and hoping the
        // background import has finished by the time we query.
        importRequest.setParam("synchronous", "true");
        server.query(importRequest);

        QueryResponse response = server.query(new SolrQuery("*:*"));
        // Without this check the loop below passes vacuously when nothing was imported.
        assertEquals(1, response.getResults().size());
        for (SolrDocument doc : response.getResults()) {
            assertNotNull(doc.getFieldValues("type"));
            assertEquals(3, doc.getFieldValues("type").size());
        }
    }
}

以及該測試用例所用的POM:

<!-- Maven build descriptor for the TestSplitBy reproduction. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 
    <modelVersion>4.0.0</modelVersion> 
    <groupId>org.example</groupId> 
    <artifactId>solr</artifactId> 
    <version>0.0.1-SNAPSHOT</version> 
    <name>Solr Sanity</name> 

    <build> 
    <plugins> 
     <!-- Compile with Java 1.6 source and target levels. -->
     <plugin> 
     <groupId>org.apache.maven.plugins</groupId> 
     <artifactId>maven-compiler-plugin</artifactId> 
     <version>2.3.2</version> 
     <configuration> 
      <source>1.6</source> 
      <target>1.6</target> 
     </configuration> 
     </plugin> 
    </plugins> 
    </build> 

    <dependencies> 
    <!-- Solr 4.1.0 web application, declared as a WAR artifact. -->
    <dependency> 
     <groupId>org.apache.solr</groupId> 
     <artifactId>solr</artifactId> 
     <version>4.1.0</version> 
     <type>war</type> 
    </dependency> 
    <!-- DataImportHandler module; presumably supplies the /dataimport handler class named in solrconfig.xml. -->
    <dependency> 
     <groupId>org.apache.solr</groupId> 
     <artifactId>solr-dataimporthandler</artifactId> 
     <version>4.1.0</version> 
     <type>jar</type> 
    </dependency> 
    <!-- SolrJ client API (SolrQuery, SolrServer, QueryResponse). -->
    <dependency> 
     <groupId>org.apache.solr</groupId> 
     <artifactId>solr-solrj</artifactId> 
     <version>4.1.0</version> 
     <type>jar</type> 
    </dependency> 
    <!-- Commons DbUtils: QueryRunner is used to populate the test table. -->
    <dependency> 
     <groupId>commons-dbutils</groupId> 
     <artifactId>commons-dbutils</artifactId> 
     <version>1.5</version> 
     <type>jar</type> 
    </dependency> 
    <!-- HSQLDB in-memory database that the import reads from; declared with runtime scope. -->
    <dependency> 
     <groupId>org.hsqldb</groupId> 
     <artifactId>hsqldb</artifactId> 
     <version>2.2.9</version> 
     <type>jar</type> 
     <scope>runtime</scope> 
    </dependency> 
    <!-- JUnit 4 test framework. -->
    <dependency> 
     <groupId>junit</groupId> 
     <artifactId>junit</artifactId> 
     <version>4.11</version> 
    </dependency> 
    <!-- Servlet API; NOTE(review): presumably required by Solr classes when running embedded - confirm. -->
    <dependency> 
     <groupId>javax.servlet</groupId> 
     <artifactId>servlet-api</artifactId> 
     <version>2.5</version> 
    </dependency> 
    </dependencies> 
</project> 

有人知道如何讓這些 type 值正確地拆分成多個值嗎?

+0

可能重複? http://stackoverflow.com/questions/8971476/splitting-multivalued-field-while-importing-data-into-solr –

+0

是的 - 我已經看過那個問題(以及其他所有重複問題中的答案建議)。那些解決方案與我的配置的唯一區別,似乎就是在有問題的字段上添加了'sourceColName'屬性(我已經嘗試過)。我似乎遺漏了某個很明顯的東西。有沒有「splitBy」功能的可運行示例? – condit

+0

你的數據庫中的列(column)叫什麼?類型或**類型**?根據你的定義,它是類型。這與你實際的列名一致嗎? –

回答

4

原來這個單元測試存在幾個問題:

  1. HSQL的列名是大小寫敏感的(和默認爲大寫)。

  2. 如果Solr字段名稱和db列名稱相同,則還會添加具有整個db值的額外標記。

  3. 字段定義應該是這樣的:

<field column="solrField" splitBy="," sourceColName="TYPE" /> 

更一般地說 - 使用RegexTransformer、並且多值字段與來自DB的單值字段混用時:

  • 如果使用splitBy那麼column屬性是Solr字段的名稱。該sourceColName是數據庫列

  • 如果不使用splitBy,那麼column屬性是數據庫列名,name屬性是Solr字段的名稱