
Python: Setting the Event Hubs startingPosition on Databricks

Tags: python, apache-spark, pyspark, azure-eventhub

I am trying to read a stream of events from Event Hubs using PySpark, but I cannot set the starting position to the beginning of the stream. With Scala this is clear, but with Python I keep getting:

org.json4s.package$MappingException: No usable value for offset.

Here is my configuration:

conf = {
  "eventhubs.connectionString":
      "Endpoint=sb://XXXX;SharedAccessKeyName=XXX;SharedAccessKey=XXXX;EntityPath=XXXX",
  "eventhubs.consumerGroup": "$Default",
  "eventhubs.startingPosition": "-1"
}

In Scala:

val cs = "YOUR.CONNECTION.STRING"
val ehConf = EventHubsConf(cs)
  .setStartingPosition(EventPosition.fromEndOfStream)
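To read from the beginning of the stream instead, the Scala connector also exposes EventPosition.fromStartOfStream.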

In Python via PySpark:

import json

ehConf = {'eventhubs.connectionString': connectionString}

startTime = "2020-04-07T01:05:05.662231Z"
endTime = "2020-04-07T01:15:05.662185Z"

startingEventPosition = {
    "offset": None,
    "seqNo": -1,              # not in use
    "enqueuedTime": startTime,
    "isInclusive": True
}

endingEventPosition = {
    "offset": None,           # not in use
    "seqNo": -1,              # not in use
    "enqueuedTime": endTime,
    "isInclusive": True
}

# Put the positions into the Event Hub config dictionary
ehConf["eventhubs.startingPosition"] = json.dumps(startingEventPosition)
ehConf["eventhubs.endingPosition"] = json.dumps(endingEventPosition)

df = spark.read.format("eventhubs").options(**ehConf).load()
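The positions above select a window by enqueued time. For the original goal, starting from the very beginning of the stream, note that the connector expects eventhubs.startingPosition to be a JSON-serialized position object; passing the bare string "-1" (as in the question) is what triggers the MappingException. A minimal sketch, assuming the same position format as above:

# Start from the beginning of the stream: serialize a position whose
# offset is the string "-1" rather than passing "-1" directly.
startOfStream = {
    "offset": "-1",           # "-1" denotes the start of the stream
    "seqNo": -1,              # not in use
    "enqueuedTime": None,     # not in use
    "isInclusive": True
}
ehConf["eventhubs.startingPosition"] = json.dumps(startOfStream)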
In Python via the SDK:

Consume events from an Event Hub asynchronously:

import logging
import asyncio
from azure.eventhub.aio import EventHubConsumerClient

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'

logger = logging.getLogger("azure.eventhub")
logging.basicConfig(level=logging.INFO)

async def on_event(partition_context, event):
    logger.info("Received event from partition {}".format(partition_context.partition_id))
    await partition_context.update_checkpoint(event)

async def receive():
    client = EventHubConsumerClient.from_connection_string(connection_str, consumer_group, eventhub_name=eventhub_name)
    async with client:
        await client.receive(
            on_event=on_event,
            starting_position="-1",  # "-1" is from the beginning of the partition.
        )
        # receive events from specified partition:
        # await client.receive(on_event=on_event, partition_id='0')

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(receive())
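Besides "-1", the SDK's starting_position parameter also accepts "@latest" for the end of the partition, a datetime.datetime for events enqueued after that time, or an integer sequence number.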
Consume events from an Event Hub asynchronously in batches:

import logging
import asyncio
from azure.eventhub.aio import EventHubConsumerClient

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'

logger = logging.getLogger("azure.eventhub")
logging.basicConfig(level=logging.INFO)

async def on_event_batch(partition_context, events):
    logger.info("Received event from partition {}".format(partition_context.partition_id))
    await partition_context.update_checkpoint()

async def receive_batch():
    client = EventHubConsumerClient.from_connection_string(connection_str, consumer_group, eventhub_name=eventhub_name)
    async with client:
        await client.receive_batch(
            on_event_batch=on_event_batch,
            starting_position="-1",  # "-1" is from the beginning of the partition.
        )
        # receive events from specified partition:
        # await client.receive_batch(on_event_batch=on_event_batch, partition_id='0')

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(receive_batch())
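Checkpointing once per batch, as above, reduces round trips to the checkpoint store compared with checkpointing after every individual event.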
Consume events and save checkpoints using a checkpoint store:

import asyncio

from azure.eventhub.aio import EventHubConsumerClient
from azure.eventhub.extensions.checkpointstoreblobaio import BlobCheckpointStore

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'
storage_connection_str = '<< CONNECTION STRING FOR THE STORAGE >>'
container_name = '<<NAME OF THE BLOB CONTAINER>>'

async def on_event(partition_context, event):
    # do something
    await partition_context.update_checkpoint(event)  # Or update_checkpoint every N events for better performance.

async def receive(client):
    await client.receive(
        on_event=on_event,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )

async def main():
    checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)
    client = EventHubConsumerClient.from_connection_string(
        connection_str,
        consumer_group,
        eventhub_name=eventhub_name,
        checkpoint_store=checkpoint_store,  # For load balancing and checkpoint. Leave None for no load balancing
    )
    async with client:
        await receive(client)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
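Note that BlobCheckpointStore ships separately from the core SDK, in the azure-eventhub-checkpointstoreblob-aio package.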


Comments:

If you like this answer and it works for you, please accept it and upvote it, as Stack Overflow recommends. Thanks.

Thanks for your comment. Yes, it is very clear with Scala, but I can't seem to get it to work with PySpark.

Thanks again, but I was being very specific here. I know how to do this with the standard Python SDK; I am looking for how to do it with PySpark.

@Ali Masri, I have added the PySpark code as well.

@AliMasri, my pleasure.

@AliMasri, by the way, since you like this answer and it works for you, please upvote it as well as accepting it, since Stack Overflow recommends doing both. Thanks.