如何使用Java将数据从云存储加载到BigQuery
我想上传数据从谷歌云存储到大查询表。下面是我创建作业的代码:如何使用Java将数据从云存储加载到BigQuery,java,google-bigquery,google-cloud-storage,Java,Google Bigquery,Google Cloud Storage,我想上传数据从谷歌云存储到大查询表。下面是我创建作业的代码: public class LoadStorageToBigQuery { // /////////////////////// // USER GENERATED VALUES: you must fill in values specific to your // application. // // Visit the Google API Console to create a Project and generate an
public class LoadStorageToBigQuery {
// ///////////////////////
// USER GENERATED VALUES: you must fill in values specific to your
// application.
//
// Visit the Google API Console to create a Project and generate an
// OAuth 2.0 Client ID and Secret (http://code.google.com/apis/console).
// Then, add the Project ID below, and update the clientsecrets.json file
// with your client_id and client_secret
//
// ///////////////////////
private static final String PROJECT_ID = "gavd.com:compute";
private static final String CLIENTSECRETS_LOCATION = "/client_secrets.json";
private static final String RESOURCE_PATH =
("E:/Work On/ads/Cloud/Dev/Source/BigQueryDemo02" + CLIENTSECRETS_LOCATION).replace(
'/', File.separatorChar);
static GoogleClientSecrets clientSecrets = loadClientSecrets();
// Static variables for API scope, callback URI, and HTTP/JSON functions
private static final List<String> SCOPES = Arrays
.asList("https://www.googleapis.com/auth/bigquery");
private static final String REDIRECT_URI = "urn:ietf:wg:oauth:2.0:oob";
private static final HttpTransport TRANSPORT = new NetHttpTransport();
private static final JsonFactory JSON_FACTORY = new JacksonFactory();
private static GoogleAuthorizationCodeFlow flow = null;
/**
* @param args
* @throws IOException
* @throws InterruptedException
*/
public static void main(String[] args) throws IOException,
InterruptedException {
System.out.println(CLIENTSECRETS_LOCATION);
// Create a new BigQuery client authorized via OAuth 2.0 protocol
Bigquery bigquery = createAuthorizedClient();
// Print out available datasets to the console
listDatasets(bigquery, "publicdata");
JobReference jobId = startQuery(bigquery, PROJECT_ID);
System.out.println("Job ID = " + jobId);
JobReference jobRef = startQuery(bigquery, PROJECT_ID);
checkQueryResults(bigquery, PROJECT_ID, jobRef);
}
/**
* Creates an authorized BigQuery client service using the OAuth 2.0
* protocol
*
* This method first creates a BigQuery authorization URL, then prompts the
* user to visit this URL in a web browser to authorize access. The
* application will wait for the user to paste the resulting authorization
* code at the command line prompt.
*
* @return an authorized BigQuery client
* @throws IOException
*/
public static Bigquery createAuthorizedClient() throws IOException {
String authorizeUrl = new GoogleAuthorizationCodeRequestUrl(
clientSecrets, REDIRECT_URI, SCOPES).setState("").build();
System.out
.println("Paste this URL into a web browser to authorize BigQuery Access:\n"
+ authorizeUrl);
System.out.println("... and type the code you received here: ");
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String authorizationCode = in.readLine();
// Exchange the auth code for an access token and refesh token
Credential credential = exchangeCode(authorizationCode);
return Bigquery.builder(TRANSPORT, JSON_FACTORY)
.setHttpRequestInitializer(credential)
.setApplicationName("Your User Agent Here").build();
}
/**
* Display all BigQuery Datasets associated with a Project
*
* @param bigquery
* an authorized BigQuery client
* @param projectId
* a string containing the current project ID
* @throws IOException
*/
public static void listDatasets(Bigquery bigquery, String projectId)
throws IOException {
Datasets.List datasetRequest = bigquery.datasets().list(projectId);
DatasetList datasetList = datasetRequest.execute();
if (datasetList.getDatasets() != null) {
List<DatasetList.Datasets> datasets = datasetList.getDatasets();
System.out.println("Available datasets\n----------------");
System.out.println( " B = " + datasets.toString());
for (DatasetList.Datasets dataset : datasets) {
System.out.format("%s\n", dataset.getDatasetReference()
.getDatasetId());
}
}
}
public static JobReference startQuery(Bigquery bigquery, String projectId) throws IOException {
Job job = new Job();
JobConfiguration config = new JobConfiguration();
JobConfigurationLoad loadConfig = new JobConfigurationLoad();
config.setLoad(loadConfig);
job.setConfiguration(config);
// Set where you are importing from (i.e. the Google Cloud Storage paths).
List<String> sources = new ArrayList<String>();
sources.add("gs://gms_cloud_project/bigquery_data/06_13_2014/namesbystate.csv");
loadConfig.setSourceUris(sources);
//state:STRING,sex:STRING,year:INTEGER,name:STRING,occurrence:INTEGER
// Describe the resulting table you are importing to:
TableReference tableRef = new TableReference();
tableRef.setDatasetId("gimasys_database");
tableRef.setTableId("table_test");
tableRef.setProjectId(projectId);
loadConfig.setDestinationTable(tableRef);
List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
TableFieldSchema fieldState = new TableFieldSchema();
fieldState.setName("state");
fieldState.setType("STRING");
TableFieldSchema fieldSex = new TableFieldSchema();
fieldSex.setName("sex");
fieldSex.setType("STRING");
TableFieldSchema fieldName = new TableFieldSchema();
fieldName.setName("name");
fieldName.setType("STRING");
TableFieldSchema fieldYear = new TableFieldSchema();
fieldYear.setName("year");
fieldYear.setType("INTEGER");
TableFieldSchema fieldOccur = new TableFieldSchema();
fieldOccur.setName("occurrence");
fieldOccur.setType("INTEGER");
fields.add(fieldState);
fields.add(fieldSex);
fields.add(fieldName);
fields.add(fieldYear);
fields.add(fieldOccur);
TableSchema schema = new TableSchema();
schema.setFields(fields);
loadConfig.setSchema(schema);
// Also set custom delimiter or header rows to skip here....
// [not shown].
Insert insert = bigquery.jobs().insert(projectId, job);
insert.setProjectId(projectId);
JobReference jobRef = insert.execute().getJobReference();
// ... see rest of codelab for waiting for job to complete.
return jobRef;
//return jobId;
}
/**
* Polls the status of a BigQuery job, returns Job reference if "Done"
*
* @param bigquery
* an authorized BigQuery client
* @param projectId
* a string containing the current project ID
* @param jobId
* a reference to an inserted query Job
* @return a reference to the completed Job
* @throws IOException
* @throws InterruptedException
*/
private static Job checkQueryResults(Bigquery bigquery, String projectId,
JobReference jobId) throws IOException, InterruptedException {
// Variables to keep track of total query time
long startTime = System.currentTimeMillis();
long elapsedTime;
while (true) {
Job pollJob = bigquery.jobs().get(projectId, jobId.getJobId())
.execute();
elapsedTime = System.currentTimeMillis() - startTime;
System.out.format("Job status (%dms) %s: %s\n", elapsedTime,
jobId.getJobId(), pollJob.getStatus().getState());
if (pollJob.getStatus().getState().equals("DONE")) {
return pollJob;
}
// Pause execution for one second before polling job status again,
// to
// reduce unnecessary calls to the BigQUery API and lower overall
// application bandwidth.
Thread.sleep(1000);
}
}
/**
* Helper to load client ID/Secret from file.
*
* @return a GoogleClientSecrets object based on a clientsecrets.json
*/
private static GoogleClientSecrets loadClientSecrets() {
try {
System.out.println("A");
System.out.println(CLIENTSECRETS_LOCATION);
GoogleClientSecrets clientSecrets = GoogleClientSecrets.load(new JacksonFactory(),
new FileInputStream(new File(
RESOURCE_PATH)));
return clientSecrets;
} catch (Exception e) {
System.out.println("Could not load file Client_Screts");
e.printStackTrace();
}
return clientSecrets;
}
/**
* Exchange the authorization code for OAuth 2.0 credentials.
*
* @return an authorized Google Auth flow
*/
static Credential exchangeCode(String authorizationCode) throws IOException {
GoogleAuthorizationCodeFlow flow = getFlow();
GoogleTokenResponse response = flow.newTokenRequest(authorizationCode)
.setRedirectUri(REDIRECT_URI).execute();
return flow.createAndStoreCredential(response, null);
}
/**
* Build an authorization flow and store it as a static class attribute.
*
* @return a Google Auth flow object
*/
static GoogleAuthorizationCodeFlow getFlow() {
if (flow == null) {
HttpTransport httpTransport = new NetHttpTransport();
JacksonFactory jsonFactory = new JacksonFactory();
flow = new GoogleAuthorizationCodeFlow.Builder(httpTransport,
jsonFactory, clientSecrets, SCOPES)
.setAccessType("offline").setApprovalPrompt("force")
.build();
}
return flow;
}
如果有任何帮助,我将不胜感激
谢谢所以问题似乎是代码要求“string,string,string,integer,integer”,但提供的数据是“string,string,integer,string,integer”-因此BigQuery无法将第4列中的字符串转换为整数 要获取准确的错误消息,请运行
bq show-j job\u yourjobid
pollJob.getStatus().getState().equals(“完成”)会告诉您作业何时完成,但不会给出退出代码
您应该显式地检查errorresult pollJob.getStatus().getErrorResult()
您可以尝试使用fieldYear.setType(“字符串”)和fieldoccurse.setType(“字符串”)吗?这可能是一个数据一致性问题。好主意,我试过了,看起来还不错。然而,我仍然在思考年份和发生的类型。下面是我想插入到Bigquery中的示例数据,“LA F 1976 Jessica 321”。您如何看待将“INTEGER”数据插入到Bigquery中的方法…:)这就是问题所在!请注意,代码设置变量的顺序与该行不同。是的,我也在考虑这个问题?您认为在BigQuery中插入“Integer”类型的好方法是什么。如果您能告诉我解决此问题的方法,我将不胜感激:)要么将其作为字符串插入并在运行时强制转换(如果需要,将结果保存到新表),要么在定义列时直接设置顺序(请参见下面的答案)。谢谢,我将尝试:)
Job ID = {"jobId":"job_MqfuhuAU1Ms0GIOSbiePFGlc6TE","projectId":"ads.com:compute"}
Job status (451ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (2561ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (6812ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (8273ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (9695ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (11146ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (12466ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (13948ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (15392ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (16796ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: PENDING
Job status (18296ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: RUNNING
Job status (19755ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: RUNNING
Job status (21587ms) job_fOtciwR1pfytbkeMaQ9RvvH18qc: DONE
while (true) {
Job pollJob = getBigQuery().jobs().get(projectId, jobId.getJobId()).execute();
elapsedTime = System.currentTimeMillis() - startTime;
if (pollJob.getStatus().getErrorResult() != null) {
// The job ended with an error.
System.out.format("Job %s ended with error %s", jobId.getJobId(),pollJob.getStatus().getErrorResult().getMessage(), projectId);
throw new RuntimeException(String.format("Job %s ended with error %s", jobId.getJobId(),
pollJob.getStatus().getErrorResult().getMessage()));
}
System.out.format("Job status (%dms) %s: %s\n", elapsedTime,
jobId.getJobId(), pollJob.getStatus().getState());
if (pollJob.getStatus().getState().equals("DONE")) {
break;
}
Thread.sleep(5000);
}