String cython-将字符串转换为整数和浮点数_String_Floating Point_Int_Cython

String cython-将字符串转换为整数和浮点数

string floating-point

String cython-将字符串转换为整数和浮点数,string,floating-point,int,cython,String,Floating Point,Int,Cython,我有一个在Cython中逐行读取的数据集。每行作为字符串返回。我想做的是将字符串转换成一个数字数组（int和float），其长度等于每行中的列数（由分隔符“；”给出）比如说 import pandas as pd import numpy as np df = pd.DataFrame(np.c_[np.random.rand(3,2),np.random.randint(0,10,(3,2))], columns = ['a','b','c','d']) filename = r'H:\

我有一个在Cython中逐行读取的数据集。每行作为字符串返回。我想做的是将字符串转换成一个数字数组（int和float），其长度等于每行中的列数（由分隔符“；”给出）

比如说

import pandas as pd
import numpy as np

# Build a small demo table: two columns of random floats (a, b) next to two
# columns of random integers in [0, 10) (c, d). np.c_ joins them column-wise
# into a single array, so the integers are stored (and written) as floats.
df = pd.DataFrame(
    np.c_[np.random.rand(3, 2), np.random.randint(0, 10, (3, 2))],
    columns=['a', 'b', 'c', 'd'],
)

filename = r'H:\mydata.csv'
# Pass the variable, not the literal string 'filename': otherwise the CSV is
# written to a file called "filename" and the reader code, which opens the
# path stored in `filename`, never finds it.
df.to_csv(filename, sep=';', index=False)

现在我想在cython中的行上随机迭代，并对每一行进行一些计算

import numpy as np
from readc_csv import row_pos, read_file_and_compute

filename = r'H:\mydata.csv'
# row_pos() records the file offset reached after each line it reads, i.e.
# the offset at which the following line starts. The header's own start is
# therefore never listed, and the last entry (the offset after the final
# line) is dropped with [:-1] because no row starts there.
row_position = row_pos(filename)[:-1]

# Draw every offset exactly once, in random order (sampling without
# replacement with size equal to the number of offsets).
rows = np.random.choice(row_position,size=len(row_position),replace=False)
# NOTE(review): read_file_and_compute declares `int [:] rows`, so the array
# dtype must match a C int on this platform -- confirm.
read_file_and_compute(filename,rows)

readc_csv.pyx文件如下所示

from libc.stdio cimport FILE, fopen, fgets, fclose, fseek, SEEK_SET, ftell
from libc.stdlib cimport atof, free, malloc
from libc.string cimport strtok
import numpy as np
cimport numpy as np

def row_pos(str filename):
    """Return the file offsets reached after each line of *filename*.

    The file is read with ``fgets`` and the ``ftell`` position after every
    complete line is collected. Entry ``k`` is therefore the offset at which
    line ``k + 1`` starts, and the last entry is the offset reached after the
    final line of the file.

    Parameters
    ----------
    filename : str
        Path of the text file to scan. It is encoded as UTF-8 before being
        passed to ``fopen``.

    Returns
    -------
    list
        Offsets as reported by ``ftell``, in file order.

    Raises
    ------
    IOError
        If the file cannot be opened.
    """
    # Keep the encoded bytes object alive: ``fname`` only borrows its buffer.
    filename_byte_string = filename.encode("UTF-8")

    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        size_t length
        bint row_open = False
        list pos = []

    cfile = fopen(fname, "r")
    if cfile == NULL:
        raise IOError("could not open file: %r" % filename)

    try:
        while fgets(line, 50, cfile) != NULL:
            # fgets returns at most 49 characters, so a long line arrives in
            # several chunks. Only the chunk that ends in '\n' closes the
            # line; recording an offset for the other chunks would point
            # into the middle of a row.
            length = 0
            while line[length] != 0:
                length += 1
            if length > 0 and line[length - 1] == b'\n':
                pos.append(ftell(cfile))
                row_open = False
            else:
                row_open = True

        # A final line without a trailing newline still ends at EOF; keep
        # that end offset so the result always finishes with the offset
        # after the last line.
        if row_open:
            pos.append(ftell(cfile))
    finally:
        fclose(cfile)

    return pos


def read_file_and_compute(str filename, int [:] rows):
    """Visit the rows of *filename* that start at the given file offsets.

    For every offset in *rows* the file is positioned with ``fseek`` and one
    line is read with ``fgets`` into a 50-byte buffer. The per-row
    computation is still a placeholder.

    Parameters
    ----------
    filename : str
        Path of the text file; encoded as UTF-8 before being passed to
        ``fopen``.
    rows : int[:]
        File offsets at which the rows to visit start, in visiting order.

    Returns
    -------
    None

    Raises
    ------
    IOError
        If the file cannot be opened or an offset cannot be seeked to.

    Notes
    -----
    Only the first 49 characters of a row fit in the line buffer.
    """
    # Keep the encoded bytes object alive: ``fname`` only borrows its buffer.
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        size_t j
        long r
        int n = rows.shape[0]

    cfile = fopen(fname, "r")
    if cfile == NULL:
        raise IOError("could not open file: %r" % filename)

    try:
        for j in range(n):
            r = rows[j]
            if fseek(cfile, r, SEEK_SET) != 0:
                raise IOError("could not seek to offset %d" % r)
            if fgets(line, 50, cfile) == NULL:
                # Nothing could be read at this offset (e.g. it is the end
                # of the file); skip it rather than reuse the stale buffer.
                continue

            # line is now e.g.
            # '0.659933520847;0.471779123704;1.0;2.0\n'
            # It still has to be converted into an array with 4 elements,
            # each element corresponding to one of the numbers in the
            # string, before the computations can be done.
    finally:
        fclose(cfile)

    return

（注意：cython代码尚未优化）背景信息：这是我想为随机梯度下降编写的脚本的一部分，该脚本针对的数据集太大，无法读入内存。我想对cython中随机排序的样本执行内部循环。因此，我需要能够读取cython中csv文件中给定行的数据

我找到了可以从 libc.string 和 libc.stdlib 导入的 C 函数（即下面代码中用到的 strtok 和 atof），它们可以完成这项工作。

继续上面的例子，read_file_and_compute 函数可能会像这样：

def read_file_and_compute(str filename, int [:] rows, int col_n):
    """Parse the rows of *filename* at the given offsets into doubles.

    For every offset in *rows* the file is positioned with ``fseek``, one
    line is read with ``fgets``, and the line is split at ``;`` with
    ``strtok``. Each field is converted with ``atof`` and stored in a C
    buffer of ``col_n`` doubles. The computation on the parsed row is still
    a placeholder.

    Requires ``malloc``, ``free`` and ``atof`` from ``libc.stdlib`` and
    ``strtok`` from ``libc.string`` to be cimported at module level.

    Parameters
    ----------
    filename : str
        Path of the ``;``-separated text file; encoded as UTF-8 before being
        passed to ``fopen``.
    rows : int[:]
        File offsets at which the rows to visit start, in visiting order.
    col_n : int
        Maximum number of fields parsed per row (size of the buffer).

    Returns
    -------
    None

    Raises
    ------
    MemoryError
        If the column buffer cannot be allocated.
    IOError
        If the file cannot be opened or an offset cannot be seeked to.

    Notes
    -----
    Only the first 49 characters of a row fit in the line buffer, and
    ``strtok`` keeps global state, so this function is not thread-safe.
    """
    # Keep the encoded bytes object alive: ``fname`` only borrows its buffer.
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        char *token
        double *col = <double *>malloc(col_n * sizeof(double))
        size_t j
        long r
        int count
        int n = rows.shape[0]

    # malloc(0) may legitimately return NULL, so only a failed non-empty
    # allocation is an error.
    if col == NULL and col_n > 0:
        raise MemoryError()

    cfile = fopen(fname, "r")
    if cfile == NULL:
        free(col)
        raise IOError("could not open file: %r" % filename)

    try:
        for j in range(n):
            r = rows[j]
            if fseek(cfile, r, SEEK_SET) != 0:
                raise IOError("could not seek to offset %d" % r)
            if fgets(line, 50, cfile) == NULL:
                # Nothing could be read at this offset (e.g. it is the end
                # of the file); skip it rather than parse a stale buffer.
                continue

            # Split at ';' and treat the trailing newline as a delimiter
            # too, so it never ends up inside a field.
            token = strtok(line, b";\n")
            count = 0
            while token != NULL and count < col_n:
                col[count] = atof(token)  # converts the field into a double
                token = strtok(NULL, b";\n")
                count += 1

            # now do some computations on col[0:count] ...
    finally:
        # Release the file and the buffer even if an exception is raised.
        fclose(cfile)
        free(col)

    return

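def read_file_and_compute(str filename, int [:] rows, int col_n):
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        char *token
        double *col = <double *>malloc(col_n * sizeof(double))
        size_t j, i
        int count
        double num
        int n = rows.shape[0]
    cfile = fopen(fname, "r")
    for j in range(n):
        r = rows[j]
        fseek(cfile,r,SEEK_SET)
        fgets(line, 50, cfile)
        token = strtok(line, ';') # splits the string at the delimiter ';'
        count = 0
        while token!=NULL and count<col_n:
            # …（此处的代码在抓取时被截断，完整版本见上文）

我找到了可以从 libc.string 和 libc.stdlib 导入的 C 函数（即上面代码中用到的 strtok 和 atof），它们可以完成这项工作。

继续上面的例子，read_file_and_compute 函数可能会像这样：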
def read_file_and_compute(str filename, int [:] rows, int col_n):
    """Parse the rows of *filename* at the given offsets into doubles.

    For every offset in *rows* the file is positioned with ``fseek``, one
    line is read with ``fgets``, and the line is split at ``;`` with
    ``strtok``. Each field is converted with ``atof`` and stored in a C
    buffer of ``col_n`` doubles. The computation on the parsed row is still
    a placeholder.

    Requires ``malloc``, ``free`` and ``atof`` from ``libc.stdlib`` and
    ``strtok`` from ``libc.string`` to be cimported at module level.

    Parameters
    ----------
    filename : str
        Path of the ``;``-separated text file; encoded as UTF-8 before being
        passed to ``fopen``.
    rows : int[:]
        File offsets at which the rows to visit start, in visiting order.
    col_n : int
        Maximum number of fields parsed per row (size of the buffer).

    Returns
    -------
    None

    Raises
    ------
    MemoryError
        If the column buffer cannot be allocated.
    IOError
        If the file cannot be opened or an offset cannot be seeked to.

    Notes
    -----
    Only the first 49 characters of a row fit in the line buffer, and
    ``strtok`` keeps global state, so this function is not thread-safe.
    """
    # Keep the encoded bytes object alive: ``fname`` only borrows its buffer.
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        char *token
        double *col = <double *>malloc(col_n * sizeof(double))
        size_t j
        long r
        int count
        int n = rows.shape[0]

    # malloc(0) may legitimately return NULL, so only a failed non-empty
    # allocation is an error.
    if col == NULL and col_n > 0:
        raise MemoryError()

    cfile = fopen(fname, "r")
    if cfile == NULL:
        free(col)
        raise IOError("could not open file: %r" % filename)

    try:
        for j in range(n):
            r = rows[j]
            if fseek(cfile, r, SEEK_SET) != 0:
                raise IOError("could not seek to offset %d" % r)
            if fgets(line, 50, cfile) == NULL:
                # Nothing could be read at this offset (e.g. it is the end
                # of the file); skip it rather than parse a stale buffer.
                continue

            # Split at ';' and treat the trailing newline as a delimiter
            # too, so it never ends up inside a field.
            token = strtok(line, b";\n")
            count = 0
            while token != NULL and count < col_n:
                col[count] = atof(token)  # converts the field into a double
                token = strtok(NULL, b";\n")
                count += 1

            # now do some computations on col[0:count] ...
    finally:
        # Release the file and the buffer even if an exception is raised.
        fclose(cfile)
        free(col)

    return

def read_file_and_compute(str filename, int [:] rows, int col_n):
    filename_byte_string = filename.encode("UTF-8")
    cdef:
        char* fname = filename_byte_string
        FILE* cfile
        char line[50]
        char *token
        double *col = <double *>malloc(col_n * sizeof(double))
        size_t j, i
        int count
        double num
        int n = rows.shape[0]
    cfile = fopen(fname, "r")
    for j in range(n):
        r = rows[j]
        fseek(cfile,r,SEEK_SET)
        fgets(line, 50, cfile)
        token = strtok(line, ';') # splits the string at the delimiter ';'
        count = 0
        while token!=NULL and count<col_n:
            # …（此处的代码在抓取时被截断，完整版本见上文）

我认为这是一条有用的评论，摘自一个没有抓住要点的答案（因此我把那个答案删除了）：如果您可以使用二进制文件而不是 csv，那么针对二进制文件已经有现成的实现可用——这显然比自己编写要容易得多。第二条评论可能也有帮助：以下 Python 代码可以工作：return np.array([float(l) for l in str(line).split(";")])。它没有经过优化，但在您寻找更好的方法时可以把它用作占位符。

我认为这是一条有用的评论，摘自一个没有抓住要点的答案（因此我把那个答案删除了）：如果您可以使用二进制文件而不是 csv，那么针对二进制文件已经有现成的实现可用——这显然比自己编写要容易得多。第二条评论可能也有帮助：以下 Python 代码可以工作：return np.array([float(l) for l in str(line).split(";")])。它没有经过优化，但在您尝试寻找更好的方法时可以把它用作占位符。

警告一句：strtok 不保证线程安全，因此，如果您使用基于 C 的实现是为了并行运行，请务必小心！如果您不会并行运行这段代码的多个实例，那就不用担心。

警告一句：strtok 不保证线程安全，因此，如果您使用基于 C 的实现是为了并行运行，那就要小心了！如果您不会同时运行这段代码的多个实例，那就不用担心。