How to load data from Cloud Storage into BigQuery using Java

I want to upload data from Google Cloud Storage to a BigQuery table. Here is the code I use to create the job:

public class LoadStorageToBigQuery {

// ///////////////////////
// USER GENERATED VALUES: you must fill in values specific to your
// application.
//
// Visit the Google API Console to create a Project and generate an
// OAuth 2.0 Client ID and Secret (http://code.google.com/apis/console).
// Then, add the Project ID below, and update the clientsecrets.json file
// with your client_id and client_secret
//
// ///////////////////////
private static final String PROJECT_ID = "gavd.com:compute";
private static final String CLIENTSECRETS_LOCATION = "/client_secrets.json";
private static final String RESOURCE_PATH =
        ("E:/Work On/ads/Cloud/Dev/Source/BigQueryDemo02" + CLIENTSECRETS_LOCATION)
                .replace('/', File.separatorChar);
static GoogleClientSecrets clientSecrets = loadClientSecrets();

// Static variables for API scope, callback URI, and HTTP/JSON functions
private static final List<String> SCOPES = Arrays
        .asList("https://www.googleapis.com/auth/bigquery");
private static final String REDIRECT_URI = "urn:ietf:wg:oauth:2.0:oob";
private static final HttpTransport TRANSPORT = new NetHttpTransport();
private static final JsonFactory JSON_FACTORY = new JacksonFactory();

private static GoogleAuthorizationCodeFlow flow = null;

/**
 * @param args
 * @throws IOException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException,
        InterruptedException {
    System.out.println(CLIENTSECRETS_LOCATION);
    // Create a new BigQuery client authorized via OAuth 2.0 protocol
    Bigquery bigquery = createAuthorizedClient();

    // Print out available datasets to the console
    listDatasets(bigquery, "publicdata");
    // Start the load job once and poll it until it finishes
    JobReference jobRef = startQuery(bigquery, PROJECT_ID);
    System.out.println("Job ID = " + jobRef);
    checkQueryResults(bigquery, PROJECT_ID, jobRef);

}

/**
 * Creates an authorized BigQuery client service using the OAuth 2.0
 * protocol
 * 
 * This method first creates a BigQuery authorization URL, then prompts the
 * user to visit this URL in a web browser to authorize access. The
 * application will wait for the user to paste the resulting authorization
 * code at the command line prompt.
 * 
 * @return an authorized BigQuery client
 * @throws IOException
 */
public static Bigquery createAuthorizedClient() throws IOException {

    String authorizeUrl = new GoogleAuthorizationCodeRequestUrl(
            clientSecrets, REDIRECT_URI, SCOPES).setState("").build();

    System.out
            .println("Paste this URL into a web browser to authorize BigQuery Access:\n"
                    + authorizeUrl);

    System.out.println("... and type the code you received here: ");
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String authorizationCode = in.readLine();

    // Exchange the auth code for an access token and refresh token
    Credential credential = exchangeCode(authorizationCode);

    return Bigquery.builder(TRANSPORT, JSON_FACTORY)
            .setHttpRequestInitializer(credential)
            .setApplicationName("Your User Agent Here").build();
}

/**
 * Display all BigQuery Datasets associated with a Project
 * 
 * @param bigquery
 *            an authorized BigQuery client
 * @param projectId
 *            a string containing the current project ID
 * @throws IOException
 */
public static void listDatasets(Bigquery bigquery, String projectId)
        throws IOException {
    Datasets.List datasetRequest = bigquery.datasets().list(projectId);
    DatasetList datasetList = datasetRequest.execute();
    if (datasetList.getDatasets() != null) {
        List<DatasetList.Datasets> datasets = datasetList.getDatasets();
        System.out.println("Available datasets\n----------------");
        System.out.println( " B = " + datasets.toString());
        for (DatasetList.Datasets dataset : datasets) {
            System.out.format("%s\n", dataset.getDatasetReference()
                    .getDatasetId());
        }
    }
}


public static JobReference startQuery(Bigquery bigquery, String projectId) throws IOException {

    Job job = new Job();
    JobConfiguration config = new JobConfiguration();
    JobConfigurationLoad loadConfig = new JobConfigurationLoad();
    config.setLoad(loadConfig);

    job.setConfiguration(config);

    // Set where you are importing from (i.e. the Google Cloud Storage paths).
    List<String> sources = new ArrayList<String>();
    sources.add("gs://gms_cloud_project/bigquery_data/06_13_2014/namesbystate.csv");
    loadConfig.setSourceUris(sources);
    //state:STRING,sex:STRING,year:INTEGER,name:STRING,occurrence:INTEGER
    // Describe the resulting table you are importing to:
    TableReference tableRef = new TableReference();
    tableRef.setDatasetId("gimasys_database");
    tableRef.setTableId("table_test");
    tableRef.setProjectId(projectId);
    loadConfig.setDestinationTable(tableRef);

    List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
    TableFieldSchema fieldState = new TableFieldSchema();
    fieldState.setName("state");
    fieldState.setType("STRING");
    TableFieldSchema fieldSex = new TableFieldSchema();
    fieldSex.setName("sex");
    fieldSex.setType("STRING");
    TableFieldSchema fieldName = new TableFieldSchema();
    fieldName.setName("name");
    fieldName.setType("STRING");
    TableFieldSchema fieldYear = new TableFieldSchema();
    fieldYear.setName("year");
    fieldYear.setType("INTEGER");
    TableFieldSchema fieldOccur = new TableFieldSchema();
    fieldOccur.setName("occurrence");
    fieldOccur.setType("INTEGER");
    fields.add(fieldState);
    fields.add(fieldSex);
    fields.add(fieldName);
    fields.add(fieldYear);
    fields.add(fieldOccur);
    TableSchema schema = new TableSchema();
    schema.setFields(fields);
    loadConfig.setSchema(schema);

    // Also set custom delimiter or header rows to skip here....
    // [not shown].

    Insert insert = bigquery.jobs().insert(projectId, job);
    insert.setProjectId(projectId);
    JobReference jobRef =  insert.execute().getJobReference();

    // ... see rest of codelab for waiting for job to complete.
    return jobRef;
    //return jobId;
}

/**
 * Polls the status of a BigQuery job, returns Job reference if "Done"
 * 
 * @param bigquery
 *            an authorized BigQuery client
 * @param projectId
 *            a string containing the current project ID
 * @param jobId
 *            a reference to an inserted query Job
 * @return a reference to the completed Job
 * @throws IOException
 * @throws InterruptedException
 */
private static Job checkQueryResults(Bigquery bigquery, String projectId,
        JobReference jobId) throws IOException, InterruptedException {
    // Variables to keep track of total query time
    long startTime = System.currentTimeMillis();
    long elapsedTime;

    while (true) {
        Job pollJob = bigquery.jobs().get(projectId, jobId.getJobId())
                .execute();
        elapsedTime = System.currentTimeMillis() - startTime;
        System.out.format("Job status (%dms) %s: %s\n", elapsedTime,
                jobId.getJobId(), pollJob.getStatus().getState());
        if (pollJob.getStatus().getState().equals("DONE")) {
            return pollJob;
        }
        // Pause execution for one second before polling the job status again,
        // to reduce unnecessary calls to the BigQuery API and lower overall
        // application bandwidth.
        Thread.sleep(1000);
    }
}

/**
 * Helper to load client ID/Secret from file.
 * 
 * @return a GoogleClientSecrets object based on a clientsecrets.json
 */
private static GoogleClientSecrets loadClientSecrets() {
    try {
        System.out.println("A");
        System.out.println(CLIENTSECRETS_LOCATION);
        GoogleClientSecrets clientSecrets = GoogleClientSecrets.load(new JacksonFactory(),
                    new FileInputStream(new File(
                        RESOURCE_PATH)));
        return clientSecrets;
    } catch (Exception e) {
        System.out.println("Could not load the client_secrets.json file");
        e.printStackTrace();
    }
    return null;
}

/**
 * Exchange the authorization code for OAuth 2.0 credentials.
 * 
 * @return an authorized Google Auth flow
 */
static Credential exchangeCode(String authorizationCode) throws IOException {
    GoogleAuthorizationCodeFlow flow = getFlow();
    GoogleTokenResponse response = flow.newTokenRequest(authorizationCode)
            .setRedirectUri(REDIRECT_URI).execute();
    return flow.createAndStoreCredential(response, null);
}

/**
 * Build an authorization flow and store it as a static class attribute.
 * 
 * @return a Google Auth flow object
 */
static GoogleAuthorizationCodeFlow getFlow() {
    if (flow == null) {
        HttpTransport httpTransport = new NetHttpTransport();
        JacksonFactory jsonFactory = new JacksonFactory();

        flow = new GoogleAuthorizationCodeFlow.Builder(httpTransport,
                jsonFactory, clientSecrets, SCOPES)
                .setAccessType("offline").setApprovalPrompt("force")
                .build();
    }
    return flow;
}
}
Any help would be appreciated.


Thanks

So the problem seems to be that the code asks for string, string, string, integer, integer, but the data supplied is string, string, integer, string, integer, so BigQuery cannot convert the string in column 4 to an integer.
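A minimal sketch of that fix, assuming the CSV columns really are state, sex, year, name, occurrence (as the comment in the question's code suggests): add the TableFieldSchema entries in that same order, so each CSV column lands in a field of the matching type. This reuses the loadConfig variable from the question's startQuery method.

    // Define the schema fields in the same order as the CSV columns:
    // state:STRING, sex:STRING, year:INTEGER, name:STRING, occurrence:INTEGER
    List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
    fields.add(new TableFieldSchema().setName("state").setType("STRING"));
    fields.add(new TableFieldSchema().setName("sex").setType("STRING"));
    fields.add(new TableFieldSchema().setName("year").setType("INTEGER"));
    fields.add(new TableFieldSchema().setName("name").setType("STRING"));
    fields.add(new TableFieldSchema().setName("occurrence").setType("INTEGER"));

    TableSchema schema = new TableSchema();
    schema.setFields(fields);
    loadConfig.setSchema(schema);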

To get the exact error message, run:

bq show -j job_yourjobid

pollJob.getStatus().getState().equals("DONE") tells you when the job has finished, but it does not give you an exit status.

You should explicitly check the error result: pollJob.getStatus().getErrorResult()
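As a rough illustration (not from the original answer), the same JobStatus object also carries a list of per-row errors for load jobs, which is often where the useful detail is. A small sketch using the standard com.google.api.services.bigquery.model classes (Job, JobStatus, ErrorProto); the helper name printLoadErrors is chosen just for this example:

    // Hypothetical helper: print the fatal error plus any row-level load errors.
    static void printLoadErrors(Job pollJob) {
        JobStatus status = pollJob.getStatus();
        if (status.getErrorResult() != null) {
            System.out.println("Job failed: " + status.getErrorResult().getMessage());
        }
        if (status.getErrors() != null) {
            for (ErrorProto error : status.getErrors()) {
                System.out.format("  %s (%s) at %s\n",
                        error.getMessage(), error.getReason(), error.getLocation());
            }
        }
    }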


Can you try fieldYear.setType("STRING") and fieldOccur.setType("STRING")? It may be a data consistency problem.

Good idea, I tried it and it seems to work. However, I am still wondering about the types for year and occurrence. Here is the sample data I want to insert into BigQuery: "LA F 1976 Jessica 321". How would you go about inserting that data into BigQuery as INTEGER... :)

That is the problem! Note that the code defines the fields in a different order than that row.

Yes, I was thinking about that too. What do you think is a good way to insert the INTEGER type into BigQuery? I would appreciate it if you could tell me how to fix this :)

Either load them as strings and cast at query time (saving the result to a new table if needed, as in the sketch below), or simply define the columns in the correct order (see the answer above).

Thanks, I will try that :)
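A minimal sketch of the cast-at-query-time option, assuming year and occurrence were loaded as STRING into gimasys_database.table_test. It reuses the bigquery client and PROJECT_ID from the question; the destination table name table_test_casted is just a placeholder, and the query uses the legacy-SQL INTEGER() cast that matches this API version.

    // Cast the string columns to integers and write the result to a new table.
    Job queryJob = new Job();
    JobConfiguration queryConfig = new JobConfiguration();
    JobConfigurationQuery queryRequest = new JobConfigurationQuery();
    queryRequest.setQuery(
            "SELECT state, sex, INTEGER(year) AS year, name, "
            + "INTEGER(occurrence) AS occurrence "
            + "FROM [gimasys_database.table_test]");

    // Placeholder destination table for the casted rows.
    TableReference destination = new TableReference()
            .setProjectId(PROJECT_ID)
            .setDatasetId("gimasys_database")
            .setTableId("table_test_casted");
    queryRequest.setDestinationTable(destination);

    queryConfig.setQuery(queryRequest);
    queryJob.setConfiguration(queryConfig);

    JobReference castJobRef = bigquery.jobs().insert(PROJECT_ID, queryJob)
            .execute().getJobReference();

For reference, the job status output from the load looked like this: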
Job ID = {"jobId":"job_MqfuhuAU1Ms0GIOSbiePFGlc6TE","projectId":"ads.com:compute"}
Job status (451ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (2561ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (6812ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (8273ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (9695ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (11146ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (12466ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (13948ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (15392ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (16796ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (18296ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: RUNNING
Job status (19755ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: RUNNING
Job status (21587ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: DONE
And here is the polling loop updated to check getErrorResult() explicitly:

    while (true) {
        Job pollJob = getBigQuery().jobs().get(projectId, jobId.getJobId()).execute();
        elapsedTime = System.currentTimeMillis() - startTime;
        if (pollJob.getStatus().getErrorResult() != null) {
            // The job ended with an error.
            System.out.format("Job %s ended with error %s\n", jobId.getJobId(),
                    pollJob.getStatus().getErrorResult().getMessage());
            throw new RuntimeException(String.format("Job %s ended with error %s",
                    jobId.getJobId(), pollJob.getStatus().getErrorResult().getMessage()));
        }
        System.out.format("Job status (%dms) %s: %s\n", elapsedTime,
                jobId.getJobId(), pollJob.getStatus().getState());

        if (pollJob.getStatus().getState().equals("DONE")) {
            break;
        }
        Thread.sleep(5000);
    }