使用Apache Beam's Python SDK

使用Apache Beam';s Python SDK,python,apache-beam,Python,Apache Beam,我有一个包含字符串的PCollection。我想按空格分割每个字符串,找到最大大小为的令牌列表,并将大小作为int存储在变量中 考虑以下示例输入: 句子=['这是第一句话', “第二句话”, “还有一句话”] 将beam.Pipeline(options=PipelineOptions())作为p: pcoll=p |“Create”>>beam.Create(句子) 拆分后的句子为: ['This', 'is', 'the', 'first', 'sentence'] -> 5 ['S

我有一个包含字符串的PCollection。我想按空格分割每个字符串,找到最大大小为的令牌列表,并将大小作为
int
存储在变量中

考虑以下示例输入:

句子=['这是第一句话',
“第二句话”,
“还有一句话”]
将beam.Pipeline(options=PipelineOptions())作为p:
pcoll=p |“Create”>>beam.Create(句子)
拆分后的句子为:

['This', 'is', 'the', 'first', 'sentence'] -> 5
['Second', 'sentence'] -> 2
['Yet', 'another', 'sentence'] -> 3
我想将值
5
存储在变量中

我该怎么做呢?我遇到过,但这并不完全符合我的目的。作者正在打印结果PCollection,但我希望稍后在管道的其他阶段使用此值

您可以使用变换来执行此操作。简单地说,我们将每个句子拆分,然后计算标记长度。使用
Top
我们只需要排名第一的结果,并传递lambda函数作为比较标准,以按字长对它们进行排序:

# Example input: one string per element of the PCollection.
sentences = ['This is the first sentence',
             'Second sentence',
             'Yet another sentence']

longest_sentence = (p
  | 'Read Sentences' >> beam.Create(sentences)
  # Tokenize each sentence on single spaces.
  | 'Split into Words' >> beam.Map(lambda x: x.split(' '))
  # Pair each token list with its length: (tokens, num_tokens).
  | 'Map Token Length' >> beam.Map(lambda x: (x, len(x)))
  # Keep only the top-1 element, compared by token count.
  | 'Top Sentence' >> combine.Top.Of(1, lambda a, b: a[1] < b[1])
  | 'Save Variable' >> beam.ParDo(SaveMaxFn()))
length
是一个全局变量:

global length
结果:

INFO:root:Longest sentence: 5 token(s)
完整代码:

import argparse, logging

import apache_beam as beam
import apache_beam.transforms.combiners as combine
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class SaveMaxFn(beam.DoFn):
  """Stores the max token count in the module-level variable ``length``.

  Receives the single-element list emitted by ``combine.Top.Of(1, ...)``,
  where each inner element is a ``(tokens, num_tokens)`` tuple.
  """
  def process(self, element):
    # Without this declaration the assignment below would only create a
    # function-local ``length`` and the global would never be updated.
    global length
    length = element[0][1]
    logging.info("Longest sentence: %s token(s)", length)

    return element


def run(argv=None):
  """Builds and runs the pipeline, saving the max token count globally.

  Args:
    argv: Optional list of command-line arguments; unknown args are
      forwarded to the Beam pipeline options.
  """
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Declared so SaveMaxFn's assignment is visible at module scope.
  global length

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Example input: one string per element of the PCollection.
  sentences = ['This is the first sentence',
               'Second sentence',
               'Yet another sentence']

  longest_sentence = (p
    | 'Read Sentences' >> beam.Create(sentences)
    # Tokenize each sentence on single spaces.
    | 'Split into Words' >> beam.Map(lambda x: x.split(' '))
    # Pair each token list with its length: (tokens, num_tokens).
    | 'Map Token Length' >> beam.Map(lambda x: (x, len(x)))
    # Keep only the top-1 element, compared by token count.
    | 'Top Sentence' >> combine.Top.Of(1, lambda a, b: a[1] < b[1])
    | 'Save Variable' >> beam.ParDo(SaveMaxFn()))

  result = p.run()
  result.wait_until_finish()

if __name__ == '__main__':
  # INFO level is needed so SaveMaxFn's logging.info line is shown.
  logging.getLogger().setLevel(logging.INFO)
  run()
import argparse,日志记录
将apache_梁作为梁导入
将apache_beam.transforms.combiners作为合并导入
从apache_beam.options.pipeline_options导入PipelineOptions
从apache_beam.options.pipeline_options导入设置选项
类SaveMaxFn(beam.DoFn):
“”“在全局变量中存储最大值”“”
def流程(自身、要素):
长度=元素[0][1]
logging.info(“最长句子:%s标记”,长度)
返回元素
def运行(argv=None):
parser=argparse.ArgumentParser()
已知参数,管道参数=解析器。解析已知参数(argv)
全局长度
管道选项=管道选项(管道参数)
管道选项。查看为(设置选项)。保存主会话=真
p=梁.管道(选项=管道\选项)
句子=句子=[“这是第一句话”,
“第二句话”,
“还有一句话”]
最长句子=(p
|“读句子”>>beam.Create(句子)
|“拆分为单词”>>beam.Map(lambda x:x.Split(“”))
|'Map Token Length'>>beam.Map(lambda x:(x,len(x)))
|'Top-statement'>>combine.Top.Of(1,lambda a,b:a[1]>beam.ParDo(SaveMaxFn())
结果=p.运行()
结果。等待直到完成()
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu':
logging.getLogger().setLevel(logging.INFO)
运行()
您可以使用转换进行转换。简单地说,我们拆分每个句子,然后计算标记长度。使用
Top
我们只需要第一个结果,并传递lambda函数作为比较标准,以按字长对其进行排序:

# Example input: one string per element of the PCollection.
sentences = ['This is the first sentence',
             'Second sentence',
             'Yet another sentence']

longest_sentence = (p
  | 'Read Sentences' >> beam.Create(sentences)
  # Tokenize each sentence on single spaces.
  | 'Split into Words' >> beam.Map(lambda x: x.split(' '))
  # Pair each token list with its length: (tokens, num_tokens).
  | 'Map Token Length' >> beam.Map(lambda x: (x, len(x)))
  # Keep only the top-1 element, compared by token count.
  | 'Top Sentence' >> combine.Top.Of(1, lambda a, b: a[1] < b[1])
  | 'Save Variable' >> beam.ParDo(SaveMaxFn()))
length
是一个全局变量:

global length
结果:

INFO:root:Longest sentence: 5 token(s)
完整代码:

import argparse, logging

import apache_beam as beam
import apache_beam.transforms.combiners as combine
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class SaveMaxFn(beam.DoFn):
  """Stores the max token count in the module-level variable ``length``.

  Receives the single-element list emitted by ``combine.Top.Of(1, ...)``,
  where each inner element is a ``(tokens, num_tokens)`` tuple.
  """
  def process(self, element):
    # Without this declaration the assignment below would only create a
    # function-local ``length`` and the global would never be updated.
    global length
    length = element[0][1]
    logging.info("Longest sentence: %s token(s)", length)

    return element


def run(argv=None):
  """Builds and runs the pipeline, saving the max token count globally.

  Args:
    argv: Optional list of command-line arguments; unknown args are
      forwarded to the Beam pipeline options.
  """
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Declared so SaveMaxFn's assignment is visible at module scope.
  global length

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Example input: one string per element of the PCollection.
  sentences = ['This is the first sentence',
               'Second sentence',
               'Yet another sentence']

  longest_sentence = (p
    | 'Read Sentences' >> beam.Create(sentences)
    # Tokenize each sentence on single spaces.
    | 'Split into Words' >> beam.Map(lambda x: x.split(' '))
    # Pair each token list with its length: (tokens, num_tokens).
    | 'Map Token Length' >> beam.Map(lambda x: (x, len(x)))
    # Keep only the top-1 element, compared by token count.
    | 'Top Sentence' >> combine.Top.Of(1, lambda a, b: a[1] < b[1])
    | 'Save Variable' >> beam.ParDo(SaveMaxFn()))

  result = p.run()
  result.wait_until_finish()

if __name__ == '__main__':
  # INFO level is needed so SaveMaxFn's logging.info line is shown.
  logging.getLogger().setLevel(logging.INFO)
  run()
import argparse,日志记录
将apache_梁作为梁导入
将apache_beam.transforms.combiners作为合并导入
从apache_beam.options.pipeline_options导入PipelineOptions
从apache_beam.options.pipeline_options导入设置选项
类SaveMaxFn(beam.DoFn):
“”“在全局变量中存储最大值”“”
def流程(自身、要素):
长度=元素[0][1]
logging.info(“最长句子:%s标记”,长度)
返回元素
def运行(argv=None):
parser=argparse.ArgumentParser()
已知参数,管道参数=解析器。解析已知参数(argv)
全局长度
管道选项=管道选项(管道参数)
管道选项。查看为(设置选项)。保存主会话=真
p=梁.管道(选项=管道\选项)
句子=句子=[“这是第一句话”,
“第二句话”,
“还有一句话”]
最长句子=(p
|“读句子”>>beam.Create(句子)
|“拆分为单词”>>beam.Map(lambda x:x.Split(“”))
|'Map Token Length'>>beam.Map(lambda x:(x,len(x)))
|'Top-statement'>>combine.Top.Of(1,lambda a,b:a[1]>beam.ParDo(SaveMaxFn())
结果=p.运行()
结果。等待直到完成()
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu':
logging.getLogger().setLevel(logging.INFO)
运行()