Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/mysql/67.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
使用JAVA web爬虫在MYSQL中存储印地语单词_Mysql_Unicode_Jdbc_Hindi - Fatal编程技术网

使用JAVA web爬虫在MYSQL中存储印地语单词

使用JAVA web爬虫在MYSQL中存储印地语单词,mysql,unicode,jdbc,hindi,Mysql,Unicode,Jdbc,Hindi,我想在MySQL数据库中存储一些印地语单词。为此,我写了一个网络爬虫。我能够从HTML页面成功地读取这些单词,并在NetBeans控制台中显示它们。但当我将它们插入MySQL时,它们会变为???????。另外,如果我在PHPMyAdmin本身中使用SQL查询插入相同的单词,它们将被正确存储 我已经搜索了谷歌和各种论坛很多,并已采取适当的预防措施,在处理Unicode在大多数地方。如果输入Unicode,是否必须在SQL语句(JDBC)中明确提及 这是我的全部代码 import java.io.*

我想在MySQL数据库中存储一些印地语单词。为此,我写了一个网络爬虫。我能够从HTML页面成功地读取这些单词,并在NetBeans控制台中显示它们。但当我将它们插入MySQL时,它们会变为???????。另外,如果我在PHPMyAdmin本身中使用SQL查询插入相同的单词,它们将被正确存储

我已经搜索了谷歌和各种论坛很多,并已采取适当的预防措施,在处理Unicode在大多数地方。如果输入Unicode,是否必须在SQL语句(JDBC)中明确提及

这是我的全部代码

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.*;

public class TestDataParsing2 {
  public int counter = 1;
  private String ID = "";
  private String title = "";
  private String owner = "";
  private String s="";

  private Connection conn = null;
  private String url = "jdbc:mysql://localhost:3306";
  private String dbName = "/hindi-eng";
  private String driver = "com.mysql.jdbc.Driver";
  private String userName = "root";
  private String password = "";
  private String TABLE = "dict";

private void initdb(){
  try {
    Class.forName(driver).newInstance();
    conn = DriverManager.getConnection(url+dbName,userName,password);
  } catch (Exception e) {
    e.printStackTrace();
  }
}

private void closedb(){
  try {
    conn.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public void process(String content){
  try{
    BufferedReader reader =  new BufferedReader(new StringReader(content));
    String text = "";
    boolean start1 = false;
    boolean start2 = false;
    while (( text = reader.readLine()) != null) {
      if(text.contains("\"a")) {
        System.out.println("______________________________________________________________");
        String id = getID(text);
        this.ID = id;
        String title = getTitle(text);
        this.title = title;
        String owner = getOwner(text);
        this.owner = owner;
        start1 = true;
      }
      if(start1 && start2) {
        String s = getS(text);
        this.s = s;
        counter++;
        insert();
        start2=false;
        start1= false;
      }
      if(start1) {
        start2= true;
      }
    }
  }catch(Exception e){
    System.out.println(e);
  }
}

public void insert(){
  String insertString = "INSERT INTO " + TABLE + " VALUES (" + this.counter + ",'" +
    this.ID + "','" + this.title + "','" + this.owner + "','" + this.s + "')";
  System.out.println(insertString);
  try {
    Statement stmt = conn.createStatement();
    stmt.executeUpdate(insertString);
    stmt.close();
  } catch(Exception e) {
    System.out.println(e);
  }
}

public String getID(String text){
  String id = "";
  id = text.substring(text.indexOf("\"")+1, text.indexOf("\","));
  return id;
}

public String getTitle(String text){
  String title = "";
  title = text.substring(text.indexOf(",\"")+2, text.indexOf("\",\"1."));
  return title;
}

public String getOwner(String text){
  try{
    String owner = "";
    owner = text.substring(text.indexOf("\",\"1.")+5, text.indexOf("\"<br>"));
    int i;
    for(i=0;i<owner.length();i++) {
      String fifthChar = "\u00AE";
      int codePoint = owner.codePointAt(i);
    }
    return owner;
  } catch(Exception e) {
    System.out.println(e);
    System.out.println("eeee");
  }
  return owner;
}

public String getS(String text){
  String s = "";
  s = text.substring(0, text.indexOf("<br>"));
  return s;
}

public String download(String path) {
  String result = "";
  try {
    URL url = new URL(path);
    URLConnection conn = url.openConnection();
    conn.setDoOutput(true);
    InputStream in = null;
    in = url.openStream();
    String content = pipe(in,"utf-8");
    result = content;
  } catch (Exception e) {
    e.printStackTrace();
  }
  return result;
}

public String pipe(InputStream in,String charset) throws IOException {
  StringBuffer s = new StringBuffer();
  if(charset==null||"".equals(charset)){
    charset="utf-8";
  }
  String rLine = null;
  BufferedReader bReader = new BufferedReader(new InputStreamReader(in,"UTF-8"));
  FileOutputStream("C:\\Research\\MiningSoftwareRepositories\\Traceability-Link-Recovery\\EXPERIMENTS\\BR\\"
    + bugid + ".txt");
  while ( (rLine = bReader.readLine()) != null) {
    String tmp_rLine = rLine;
    s.append(tmp_rLine+"\n");
  }
  tmp_rLine = null;
}
  in.close();
  return s.toString();
}

public static void main(String[] args) {
  TestDataParsing2 tdp = new TestDataParsing2();
  tdp.initdb();
  System.out.println("process started");
  String urlPath = "file:///C:/Users/Abhinav/Downloads/Compressed/eng-hindi-dict-utf8/sa.htm";
  String content = tdp.download(urlPath);
  tdp.process(content);
  tdp.closedb();
}
import java.io.*;
导入java.net.URL;
导入java.net.URLConnection;
导入java.sql.Connection;
导入java.sql.DriverManager;
导入java.sql.Statement;
导入java.util.*;
公共类TestDataParsing2{
公共整数计数器=1;
私有字符串ID=“”;
私有字符串title=“”;
私有字符串所有者=”;
私有字符串s=“”;
专用连接conn=null;
私有字符串url=“jdbc:mysql://localhost:3306";
私有字符串dbName=“/hindi eng”;
私有字符串driver=“com.mysql.jdbc.driver”;
私有字符串userName=“root”;
私有字符串密码=”;
私有字符串TABLE=“dict”;
私有void initdb(){
试一试{
Class.forName(driver.newInstance();
conn=DriverManager.getConnection(url+dbName、用户名、密码);
}捕获(例外e){
e、 printStackTrace();
}
}
私有void closedb(){
试一试{
康涅狄格州关闭();
}捕获(例外e){
e、 printStackTrace();
}
}
公共作废处理(字符串内容){
试一试{
BufferedReader=新BufferedReader(新StringReader(内容));
字符串文本=”;
布尔start1=false;
布尔start2=false;
而((text=reader.readLine())!=null){
if(text.contains(“\”a”)){
系统.out.println(“uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu;
字符串id=getID(文本);
this.ID=ID;
字符串标题=getTitle(文本);
this.title=标题;
字符串所有者=getOwner(文本);
this.owner=所有者;
start1=真;
}
如果(开始1和开始2){
字符串s=get(文本);
这个.s=s;
计数器++;
插入();
start2=假;
start1=假;
}
如果(start1){
start2=真;
}
}
}捕获(例外e){
系统输出打印ln(e);
}
}
公开作废插入(){
String insertString=“插入“+TABLE+”值(“+this.counter+”,”+
this.ID+”、“+this.title+”、“+this.owner+”、“+this.s+”)”;
System.out.println(insertString);
试一试{
语句stmt=conn.createStatement();
stmt.executeUpdate(插入字符串);
stmt.close();
}捕获(例外e){
系统输出打印ln(e);
}
}
公共字符串getID(字符串文本){
字符串id=“”;
id=text.substring(text.indexOf(“\”)+1,text.indexOf(“\”,”);
返回id;
}
公共字符串getTitle(字符串文本){
字符串标题=”;
title=text.substring(text.indexOf(“,\”)+2,text.indexOf(“,”1”);
返回标题;
}
公共字符串getOwner(字符串文本){
试一试{
字符串所有者=”;
所有者=text.substring(text.indexOf(“\”,“\”1”)+5,text.indexOf(“\”
”); int i;
对于(i=0;i您没有指定数据库连接编码,因此使用服务器默认编码。看起来服务器没有配置为使用UTF-8

您可以将设置为UTF-8,或设置连接的属性:

conn = DriverManager.getConnection(url+dbName+"?characterEncoding=UTF-8",userName,password);

请记住,连接url的语法是“jdbc:mysql://host:port/database?option1=value1&option2=value2&...“

这很有效。我的意思是我知道我们必须使用它,但我把语法弄错了。但是你的最后一行今天创造了纽约。非常感谢。干杯。