Python 在从csv文件读取输入时,如何在Cassandra中插入数据以达到50k/sec的吞吐量?
我的目标是提高Cassandra中版本控制数据的吞吐量。我使用了并发读写,还增加了代码从文件中读取的块大小。我的机器是16gb,有8个内核,是的,我已经将Cassandra的yaml文件更改为10k并发读写,当计时时,我发现10000次写/读不到一秒钟。 我的最小可行代码是:Python 在从csv文件读取输入时,如何在Cassandra中插入数据以达到50k/sec的吞吐量?,python,python-3.x,cassandra,cql,Python,Python 3.x,Cassandra,Cql,我的目标是提高Cassandra中版本控制数据的吞吐量。我使用了并发读写,还增加了代码从文件中读取的块大小。我的机器是16gb,有8个内核,是的,我已经将Cassandra的yaml文件更改为10k并发读写,当计时时,我发现10000次写/读不到一秒钟。 我的最小可行代码是: import json import logging import os import sys from datetime import datetime from hashlib import sha256, sha51
import json
import logging
import os
import sys
from datetime import datetime
from hashlib import sha256, sha512, sha1
import pandas as pd
from cassandra import ConsistencyLevel, WriteTimeout
from cassandra.cluster import (EXEC_PROFILE_DEFAULT, BatchStatement, Cluster,
ExecutionProfile)
from cassandra.concurrent import (execute_concurrent,
execute_concurrent_with_args)
from cassandra.query import SimpleStatement, dict_factory
class PythonCassandraExample:
def __init__(self, file_to_be_versioned, working_dir=os.getcwd(), mode='append'):
self.cluster = None
self.session = None
self.keyspace = None
self.log = None
self.mode = mode
self.file_to_be_versioned = file_to_be_versioned
self.insert_patch = []
self.delete_patch = []
self.update_patch = []
self.working_dir = working_dir
def __del__(self):
self.cluster.shutdown()
def createsession(self):
profile = ExecutionProfile(
row_factory=dict_factory,
request_timeout=6000
)
self.cluster = Cluster(
['localhost'],
connect_timeout=50,
execution_profiles={
EXEC_PROFILE_DEFAULT: profile
}
)
self.session = self.cluster.connect(self.keyspace)
def getsession(self):
return self.session
# How about Adding some log info to see what went wrong
def setlogger(self):
log = logging.getLogger()
log.setLevel('INFO')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
log.addHandler(handler)
self.log = log
# Create Keyspace based on Given Name
def handle_error(self, exception):
self.log.error("Failed to fetch user info: %s", exception)
def createkeyspace(self, keyspace):
"""
:param keyspace: The Name of Keyspace to be created
:return:
"""
# Before we create new lets check if exiting keyspace; we will drop that and create new
self.log.info("creating keyspace...")
self.session.execute("""
CREATE KEYSPACE IF NOT EXISTS %s
WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '1' }
""" % keyspace)
self.log.info("setting keyspace...")
self.keyspace = keyspace
self.session.set_keyspace(self.keyspace)
def create_table_and_set_version(self, table_name):
self.table_name = table_name.lower()
table_select_query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name='{}' AND table_name='{}'".format(
self.keyspace, self.table_name)
print(table_select_query)
table_exists = self.session.execute(table_select_query).one()
self.log.info("Table exists: {}".format(table_exists))
if table_exists:
self.log.info(
"Datapackage already exists! Checking the last version")
self.version = self.session.execute(
"SELECT version FROM {} LIMIT 1".format(self.table_name)).one()
self.log.info(
"The version fetched is: {} of type: {}".format(
self.version, type(self.version)
)
)
if not self.version:
self.version = 0
else:
self.version = self.version['version']
else:
self.log.info("Table didn't exist!")
self.version = 0
self.target_version = int(str(self.version)) + 1
self.log.info(
"Current and candidate versions are: {}, {}".format(
self.version,
self.target_version
)
)
# c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version int, row varchar, row_hash varchar, PRIMARY KEY(id, version)) with clustering order by (version desc)".format(
# self.table_name)
c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version int, row_hash varchar, PRIMARY KEY(version, id))".format(
self.table_name
)
self.session.execute(c_sql)
self.log.info("DP Table Created !!!")
self.log.info("Current and candidate versions are: {}, {}".format(
self.version, self.target_version))
def push_to_update_patch(self, update_patch_file, last_patch=False):
if len(self.update_patch) >= 10000:
with open(update_patch_file, mode='a') as json_file:
json_file.writelines(
self.update_patch
)
del self.update_patch[:]
if last_patch is True and len(self.update_patch) > 0:
with open(update_patch_file, mode='a') as json_file:
json_file.writelines(
self.update_patch
)
del self.update_patch[:]
def push_to_insert_patch(self, insert_patch_file, last_patch=False):
if len(self.insert_patch) >= 10000:
with open(insert_patch_file, mode='a') as json_file:
json_file.writelines(
self.insert_patch
)
del self.insert_patch[:]
if last_patch is True and len(self.update_patch) > 0:
with open(insert_patch_file, mode='a') as json_file:
json_file.writelines(
self.insert_patch
)
del self.insert_patch[:]
def push_to_delete_patch(self, delete_patch_file, last_patch=False):
if len(self.delete_patch) >= 10000:
with open(delete_patch_file, mode='a') as json_file:
json_file.writelines(
self.delete_patch
)
del self.delete_patch[:]
if last_patch is True and len(self.delete_patch) > 0:
with open(delete_patch_file, mode='a') as json_file:
json_file.writelines(
self.delete_patch
)
del self.delete_patch[:]
def push_to_patch(self, key, value, mode='update'):
return
if key is None or value is None:
raise ValueError(
"Key or value or both not specified for making a patch. Exiting now."
)
data = {}
data["id"] = str(key)
data["data"] = json.dumps(value, default=str)
# convert dict to json str so that the patch is a list of line jsons.
data = json.dumps(data, default=str)
json_patch_file = os.path.join(
self.working_dir,
"version_{}_{}.json".format(
self.target_version, mode
)
)
if mode == 'update':
self.update_patch.append(
data + "\n"
)
self.push_to_update_patch(
json_patch_file
)
if mode == 'insert':
self.insert_patch.append(
data + "\n"
)
self.push_to_insert_patch(
json_patch_file
)
if mode == 'delete':
self.delete_patch.append(
data + "\n"
)
self.push_to_delete_patch(
json_patch_file
)
def clone_version(self):
if self.mode == 'replace':
return
self.log.info("Cloning version")
start_time = datetime.utcnow()
if self.version == 0:
return
insert_sql = self.session.prepare(
(
"INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
).format(
self.table_name, "id", "version", "row_hash"
)
)
futures = []
current_version_query = "SELECT id, row_hash FROM {} WHERE version={}".format(
self.table_name, self.version
)
current_version_rows = self.session.execute(
current_version_query
)
for current_version_row in current_version_rows:
params = (
current_version_row['id'],
self.target_version,
current_version_row['row_hash']
)
futures.append(
(
insert_sql,
params
)
)
self.log.info(
"Time taken to clone the version is: {}".format(
datetime.utcnow() - start_time
)
)
def hash_string(self, value):
return (sha1(str(value).encode('utf-8')).hexdigest())
def hash_row(self, row):
row_json = json.dumps(row, default=str)
return (self.hash_string(row_json))
def insert_data(self, generate_diff=False):
self.generate_diff = generate_diff
destination = self.file_to_be_versioned
chunksize = 100000
concurrency_value = 1000
patch_length_for_cql = chunksize
chunks = pd.read_csv(destination, chunksize=chunksize)
chunk_counter = 0
insert_sql = self.session.prepare(
(
"INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
).format(
self.table_name, "id", "version", "row_hash"
)
)
select_sql = self.session.prepare(
(
"SELECT id, version, row_hash FROM {} WHERE version=? AND id=?"
).format(
self.table_name
)
)
futures = []
check_for_patch = [] #this list comprises rows with ids and values for checking whether its an update/insert
rows_for_checking_patch = []
start_time = datetime.utcnow()
for df in chunks:
rows_for_checking_patch = df.values.tolist()
chunk_counter += 1
df["row_hash"] = df.apply(
self.hash_row
)
df["key"] = df["column_test_3"].apply(
self.hash_string
)
keys = list(df["key"])
row_hashes = list(df["row_hash"])
start_time_de_params = datetime.utcnow()
for i in range(chunksize):
row_check = None
params = (
str(keys[i]),
self.target_version,
str(row_hashes[i])
)
check_for_patch_params = (
self.version,
str(keys[i])
)
check_for_patch.append(
(
select_sql,
check_for_patch_params
)
)
futures.append(
(
insert_sql,
params
)
)
self.log.info("Time for params: {}".format(datetime.utcnow() - start_time_de_params))
if len(check_for_patch) >= patch_length_for_cql:
start_time_de_update = datetime.utcnow()
results = execute_concurrent(
self.session, check_for_patch, concurrency=concurrency_value, raise_on_first_error=False
)
self.log.info("Time for just the query: {}".format(datetime.utcnow() - start_time_de_update))
row_counter_for_patch = 0
for (success, result) in results:
if not result:
self.push_to_patch(
keys[row_counter_for_patch],
rows_for_checking_patch[row_counter_for_patch],
mode='insert'
)
row_counter_for_patch += 1
continue
if not success:
# result will be an Exception
self.log.error("Error has occurred in insert cql")
self.handle_error(result)
id_to_be_compared = result[0]["id"]
row_hash_to_be_compared = result[0]["row_hash"]
if (row_hash_to_be_compared != row_hashes[row_counter_for_patch]):
self.push_to_patch(
id_to_be_compared,
rows_for_checking_patch[row_counter_for_patch]["row"],
mode='update'
)
row_counter_for_patch += 1
del check_for_patch[:]
del rows_for_checking_patch[:]
row_counter_for_patch = 0
self.log.info("Time for check patch: {}".format(
datetime.utcnow() - start_time_de_update
))
if (len(futures) >= patch_length_for_cql):
start_time_de_insert = datetime.utcnow()
results = execute_concurrent(
self.session, futures, concurrency=concurrency_value, raise_on_first_error=False
)
for (success, result) in results:
if not success:
# result will be an Exception
self.log.error("Error has occurred in insert cql")
self.handle_error(result)
del futures[:]
self.log.info("Time for insert patch: {}".format(
datetime.utcnow() - start_time_de_insert
))
self.log.info(chunk_counter)
# self.log.info("This chunk got over in {}".format(datetime.utcnow() - start_time))
if len(check_for_patch) > 0:
results = execute_concurrent(
self.session, check_for_patch, concurrency=concurrency_value, raise_on_first_error=False
)
row_counter_for_patch = 0
for (success, result) in results:
if not result:
self.push_to_patch(
rows_for_checking_patch[row_counter_for_patch]["id"],
rows_for_checking_patch[row_counter_for_patch]["row"],
mode='insert'
)
row_counter_for_patch += 1
continue
if not success:
# result will be an Exception
self.log.error("Error has occurred in insert cql")
self.handle_error(result)
id_to_be_compared = result[0]["id"]
row_hash_to_be_compared = result[0]["row_hash"]
if (row_hash_to_be_compared != rows_for_checking_patch[row_counter_for_patch]["row_hash"]):
self.push_to_patch(
id_to_be_compared,
rows_for_checking_patch[row_counter_for_patch]["row"],
mode='update'
)
row_counter_for_patch += 1
del check_for_patch[:]
del rows_for_checking_patch[:]
if len(futures) > 0: # in case the last dataframe has #rows < 10k.
results = execute_concurrent(
self.session, futures, concurrency=concurrency_value, raise_on_first_error=False
)
for (success, result) in results:
if not success:
self.handle_error(result)
del futures[:]
self.log.info(chunk_counter)
# Check the delete patch
if self.generate_diff is True and self.mode is 'replace' and self.version is not 0:
self.log.info("We got to find the delete patch!")
start_time = datetime.utcnow()
current_version_query = "SELECT id, row, row_hash FROM {} WHERE version={}".format(
self.table_name, self.version
)
current_version_rows = self.session.execute(
current_version_query
)
for current_version_row in current_version_rows:
row_check_query = "SELECT {} FROM {} WHERE {}={} AND {}='{}' ".format(
"id", self.table_name, "version", self.target_version, "id", current_version_row.id
)
row_check = self.session.execute(row_check_query).one()
if row_check is not None: # row exists in both version.
continue
self.push_to_patch(
current_version_row.id,
current_version_row.id,
mode="delete"
)
print("Complete insert's duration is: {}".format(
datetime.utcnow() - start_time)
)
# Calling last_patch for all remaining diffs
modes = [
'update',
'insert',
'delete'
]
for mode in modes:
json_patch_file = os.path.join(
self.working_dir,
"version_{}_{}.json".format(
self.target_version, mode
)
)
if mode == 'update':
self.push_to_update_patch(
json_patch_file,
last_patch=True
)
if mode == 'insert':
self.push_to_insert_patch(
json_patch_file,
last_patch=True
)
if mode == 'delete':
self.push_to_delete_patch(
json_patch_file,
last_patch=True
)
if __name__ == '__main__':
example1 = PythonCassandraExample(
file_to_be_versioned="hundred_million_eleven_columns.csv"
)
example1.createsession()
example1.setlogger()
example1.createkeyspace('sat_athena_one')
example1.create_table_and_set_version('five_hundred_rows')
example1.clone_version()
example1.insert_data(generate_diff=True)
我的cassandra的yaml文件是请确保您的代码甚至可以以50k的速度生成这么多。如果您删除了execute,您甚至可以读取CSV并以如此快的速度生成sha吗?在这个大小的带有SSD的主机上,一个C*实例应该能够每秒进行50k次写操作,但在C*写操作之外,还有很多操作可能是问题的一部分 如果并发读/写超过128,则会出现一些严重问题。在一个能够处理它的系统上,64次甚至足以超过每秒20万次写入。这样高的设置实际上会让事情变得更糟。这不涉及IO,因此正如文档中所述,8倍的核心计数是一个很好的值。我甚至建议将推送的并发度从10k降低到1024或更低。您可以使用不同的设置来查看它对事物的影响 确保python是在安装时使用cython编译的,否则它将在序列化中占主导地位。说到python驱动程序,它是最慢的,所以请记住这一点 您在sha上的阻塞可能是大多数时间的阻塞。如果没有perf跟踪,只需使用固定值进行尝试即可看到差异
“我的机器”--这是单节点群集吗?如果将可用性抛出窗口,那么最好禁用键空间上的持久_写入,以加快写入速度。缺少堆设置,但请确保至少有8gb,即使这是一个非常小的主机cassandra需要内存。如果不读,考虑禁用键缓存并在作业运行时禁用压缩,然后启用。
确保代码在50K时甚至可以生成那么多。如果您删除了execute,您甚至可以读取CSV并以如此快的速度生成sha吗?在这个大小的带有SSD的主机上,一个C*实例应该能够每秒进行50k次写操作,但在C*写操作之外,还有很多操作可能是问题的一部分
如果并发读/写超过128,则会出现一些严重问题。在一个能够处理它的系统上,64次甚至足以超过每秒20万次写入。这样高的设置实际上会让事情变得更糟。这不涉及IO,因此正如文档中所述,8倍的核心计数是一个很好的值。我甚至建议将推送的并发度从10k降低到1024或更低。您可以使用不同的设置来查看它对事物的影响 确保python是在安装时使用cython编译的,否则它将在序列化中占主导地位。说到python驱动程序,它是最慢的,所以请记住这一点 您在sha上的阻塞可能是大多数时间的阻塞。如果没有perf跟踪,只需使用固定值进行尝试即可看到差异“我的机器”--这是单节点群集吗?如果将可用性抛出窗口,那么最好禁用键空间上的持久_写入,以加快写入速度。缺少堆设置,但请确保至少有8gb,即使这是一个非常小的主机cassandra需要内存。如果不读,考虑禁用密钥缓存并在作业运行时禁用压缩,然后再启用。<>注释评论8个核的数量。 另一方面,由于写入几乎从不受IO限制,因此理想 “并发_写入”的数量取决于中的内核数量 你的系统;(8*内核的数量)是一个很好的经验法则 64适用于8芯机器
- 并发读数:64
- 并发写入:64
- 并发计数器写入:64
- 磁盘优化策略:ssd//如果您的磁盘是hdd,请将值更改为旋转
- 使用专用的提交日志磁盘。建议使用ssd
- 更多磁盘=更好的性能
- 并发读数:64
- 并发写入:64
- 并发计数器写入:64
- 磁盘优化策略:ssd//如果您的磁盘是hdd,请将值更改为旋转
- 使用专用的提交日志磁盘。建议使用ssd
- 更多磁盘=更好的性能
check\u for\u patch
查询是一个select语句,只需22秒,而不迭代结果。如何减少select查询?要提高读取性能,请将keycache添加回。。。错过了。这是多少个查询?并发读取设置为1k或64会产生相同的结果:100000行/22秒。另外,我应该在哪里添加keycache?是的,它是一个单机集群。最终,它将在运行时启用多群集设置
import csv
import sys
import os
import pandas as pd
file_name = "hundred_million_eleven_columns.csv"
rows_list = []
chunk_counter = 1
headers = [
"column_test_1",
"column_test_2",
"column_test_3",
"column_test_4",
"column_test_5",
"column_test_6",
"column_test_7",
"column_test_8",
"column_test_9",
"column_test_10",
"column_test_11",
]
file_exists = os.path.isfile(file_name)
with open(file_name, 'a') as csvfile:
writer = csv.DictWriter(csvfile, delimiter=',',
lineterminator='\n', fieldnames=headers)
if not file_exists:
writer.writeheader() # file doesn't exist yet, write a header
for i in range(100000000):
dict1 = [
i, i+1, i+2, i+3, i+4, i+5, i+6, i+7, i+8, i+9, i+10
]
# get input row in dictionary format
# key = col_name
rows_list.append(dict1)
if len(rows_list) == 100000:
df = pd.DataFrame(rows_list)
df.to_csv(file_name,
mode='a', index=False, header=False)
del rows_list[:]
del df
print(chunk_counter)
chunk_counter += 1
if len(rows_list) > 0:
df = pd.DataFrame(rows_list)
df.to_csv(file_name, mode='a', index=False, header=False)
del rows_list[:]
del df
print(chunk_counter)
chunk_counter += 1