2012-07-21 22 views
0

我想在MySQL數據庫中存儲一些印地語詞。爲此我寫了一個網絡爬蟲。我能夠從HTML頁面成功讀取這些單詞,並將它們顯示在NetBeans控制檯中。但是當我在MySQL中插入它們時,它們變成了???????。另外,如果我在PHPMyAdmin中使用SQL查詢插入相同的單詞,它們會正確存儲。在MYSQL中使用JAVA網絡爬蟲存儲印地文詞

我在Google和各種論壇上搜索了很多,並且在大多數地方處理Unicode時都採取了適當的預防措施。我們是否必須在SQL語句(JDBC)中明確提及是否輸入Unicode?

這是我的整個代碼。

import java.io.*; 
import java.net.URL; 
import java.net.URLConnection; 
import java.sql.Connection; 
import java.sql.DriverManager; 
import java.sql.Statement; 
import java.util.*; 

public class TestDataParsing2 { 
    public int counter = 1; 
    private String ID = ""; 
    private String title = ""; 
    private String owner = ""; 
    private String s=""; 

    private Connection conn = null; 
    private String url = "jdbc:mysql://localhost:3306"; 
    private String dbName = "/hindi-eng"; 
    private String driver = "com.mysql.jdbc.Driver"; 
    private String userName = "root"; 
    private String password = ""; 
    private String TABLE = "dict"; 

private void initdb(){ 
    try { 
    Class.forName(driver).newInstance(); 
    conn = DriverManager.getConnection(url+dbName,userName,password); 
    } catch (Exception e) { 
    e.printStackTrace(); 
    } 
} 

private void closedb(){ 
    try { 
    conn.close(); 
    } catch (Exception e) { 
    e.printStackTrace(); 
    } 
} 
public void process(String content){ 
    try{ 
    BufferedReader reader = new BufferedReader(new StringReader(content)); 
    String text = ""; 
    boolean start1 = false; 
    boolean start2 = false; 
    while ((text = reader.readLine()) != null) { 
     if(text.contains("\"a")) { 
     System.out.println("______________________________________________________________"); 
     String id = getID(text); 
     this.ID = id; 
     String title = getTitle(text); 
     this.title = title; 
     String owner = getOwner(text); 
     this.owner = owner; 
     start1 = true; 
     } 
     if(start1 && start2) { 
     String s = getS(text); 
     this.s = s; 
     counter++; 
     insert(); 
     start2=false; 
     start1= false; 
     } 
     if(start1) { 
     start2= true; 
     } 
    } 
    }catch(Exception e){ 
    System.out.println(e); 
    } 
} 

public void insert(){ 
    String insertString = "INSERT INTO " + TABLE + " VALUES (" + this.counter + ",'" + 
    this.ID + "','" + this.title + "','" + this.owner + "','" + this.s + "')"; 
    System.out.println(insertString); 
    try { 
    Statement stmt = conn.createStatement(); 
    stmt.executeUpdate(insertString); 
    stmt.close(); 
    } catch(Exception e) { 
    System.out.println(e); 
    } 
} 

public String getID(String text){ 
    String id = ""; 
    id = text.substring(text.indexOf("\"")+1, text.indexOf("\",")); 
    return id; 
} 

public String getTitle(String text){ 
    String title = ""; 
    title = text.substring(text.indexOf(",\"")+2, text.indexOf("\",\"1.")); 
    return title; 
} 

public String getOwner(String text){ 
    try{ 
    String owner = ""; 
    owner = text.substring(text.indexOf("\",\"1.")+5, text.indexOf("\"<br>")); 
    int i; 
    for(i=0;i<owner.length();i++) { 
     String fifthChar = "\u00AE"; 
     int codePoint = owner.codePointAt(i); 
    } 
    return owner; 
    } catch(Exception e) { 
    System.out.println(e); 
    System.out.println("eeee"); 
    } 
    return owner; 
} 

public String getS(String text){ 
    String s = ""; 
    s = text.substring(0, text.indexOf("<br>")); 
    return s; 
} 

public String download(String path) { 
    String result = ""; 
    try { 
    URL url = new URL(path); 
    URLConnection conn = url.openConnection(); 
    conn.setDoOutput(true); 
    InputStream in = null; 
    in = url.openStream(); 
    String content = pipe(in,"utf-8"); 
    result = content; 
    } catch (Exception e) { 
    e.printStackTrace(); 
    } 
    return result; 
} 

public String pipe(InputStream in,String charset) throws IOException { 
    StringBuffer s = new StringBuffer(); 
    if(charset==null||"".equals(charset)){ 
    charset="utf-8"; 
    } 
    String rLine = null; 
    BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8")); 
    FileOutputStream("C:\\Research\\MiningSoftwareRepositories\\Traceability-Link-Recovery\\EXPERIMENTS\\BR\\" 
    + bugid + ".txt"); 
    while ((rLine = bReader.readLine()) != null) { 
    String tmp_rLine = rLine; 
    s.append(tmp_rLine+"\n"); 
    } 
    tmp_rLine = null; 
} 
    in.close(); 
    return s.toString(); 
} 

public static void main(String[] args) { 
    TestDataParsing2 tdp = new TestDataParsing2(); 
    tdp.initdb(); 
    System.out.println("process started"); 
    String urlPath = "file:///C:/Users/Abhinav/Downloads/Compressed/eng-hindi-dict-utf8/sa.htm"; 
    String content = tdp.download(urlPath); 
    tdp.process(content); 
    tdp.closedb(); 
} 

而且當我添加 「?了useUnicode = YES &的characterEncoding = UTF-8」 到conn_url我收到以下錯誤,這不是沒有它的到來。

java.sql.SQLException:不支持的字符編碼'UTF-8/hindi-eng'。 過程在com.mysql.jdbc.SQLError.createSQLException(SQLError.java:1074) 開始 在com.mysql.jdbc.SQLError.createSQLException(SQLError.java:988) 在com.mysql.jdbc.SQLError.createSQLException (SQLError.java:974) at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:919) at com.mysql.jdbc.StringUtils.getBytes(StringUtils.java:574) at com.mysql.jdbc .StringUtils.getBytes(StringUtils.java:719) 在com.mysql.jdbc.Buffer.writeStringNoNull(Buffer.java:704) 在com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2573) 在玉米.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2713) at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2 663) 在com.mysql.jdbc.StatementImpl.executeQuery(StatementImpl.java:1599) 在com.mysql.jdbc.ConnectionImpl.loadServerVariables(ConnectionImpl.java:3928) 在com.mysql.jdbc.ConnectionImpl.initializePropsFromServer( ConnectionImpl.java:3473) 在com.mysql.jdbc.ConnectionImpl.connectOneTryOnly(ConnectionImpl.java:2445) 在com.mysql.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:2215) 在com.mysql.jdbc。 ConnectionImpl(ConnectionImpl.java:813) 在com.mysql.jdbc.JDBC4Connection。(JDBC4Connection.java:47) 在sun.reflect.NativeConstructorAccessorImpl.newInstance0(本機方法) 在sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl .java:57) at su n.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) 在java.lang.reflect.Constructor.newInstance(Constructor.java:525) 在com.mysql.jdbc.Util.handleNewInstance(Util.java:411) at com.mysql.jdbc.ConnectionImpl.getInstance(ConnectionImpl.java:399) at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:334) at java.sql.DriverManager.getConnection(DriverManager.java: 579) 在java.sql.DriverManager.getConnection(DriverManager.java:221) 在TestDataParsing2.initdb(TestDataParsing2.java:29) 在TestDataParsing2.main(TestDataParsing2.java:239)


INSERT INTO dict VALUES(2,'a','Art','एक','我買了一支筆。') java.lang。NullPointerException異常 _ __ _ __ _ __ _ __ _ __ _ __ _ __ _ __ _ __ _ __ _ __ _ INSERT INTO dict VALUES(3,'aback','Adv','पीछे/हतप्रभ','I was some ') java.lang.NullPointerException

回答

2

您不指定數據庫連接編碼,因此使用了服務器默認編碼。它看起來像服務器沒有配置爲使用UTF-8。

您可以設置server encoding爲UTF-8,或者設置連接上characterEncoding屬性:

conn = DriverManager.getConnection(url+dbName+"?characterEncoding=UTF-8",userName,password); 

記住連接的URL語法是「的jdbc:mysql的://主機:端口/ database?opt​​ion1 = value1 & option2 = value2 & ...「。

+0

工作。我的意思是我知道我們必須使用它,但我正在混合的語法。但今天的最後一行是在今天。非常感謝。乾杯。 – Abhinav 2012-07-21 09:36:58