Python: How to speed up a requests function on a large dataframe

I have a table with 38 million rows. One of the columns is a URL. For each row I need to fetch that URL, extract and process the XML behind it, and save the result into a new column. I parallelize the whole thing in chunks with Pool, working on pandas dataframes. I am using 100% of my 8 cores and a 1000 Mbps connection, and the computation would still take about 12 days to finish. Any suggestions on how to improve this?

import gzip
import json
import os
import re
import time
from io import BytesIO
from multiprocessing import Pool

import boto
import numpy as np
import pandas as pd
import xmltodict
from bs4 import BeautifulSoup as bs


class Receptores():
    def aux_tupla(self, df):
        # For every row, rewrite the /v01/ path to /depot/, then fetch and parse
        # the remote file, storing the resulting (dictionary, detalle) tuple.
        df['aux_tupla'] = df['uri'].str.replace('/v01/', '/depot/').apply(lambda x: self.uriToDicts3(x))
        return df

    def uriToDicts3(self, url):
        # Download the gzipped XML behind the URL from S3, parse it with
        # BeautifulSoup and return (datos_adicionales, Detalle) as two JSON strings.
        regex = self.URL_REGEX.match(url)
        path = "%s/%s/%s/%s.gz" % (regex.group(1), regex.group(2), regex.group(4), regex.group(5))
        _file = self.bucket.get_key(path, validate=False)
        compressed_file = BytesIO()
        try:
            _file.get_file(compressed_file)
            compressed_file.seek(0)
            decompressed_file = gzip.GzipFile(fileobj=compressed_file, mode='rb')
            rq = decompressed_file.read()

        except boto.exception.S3ResponseError as ex:
            print("Error >>", ex.message)
            return json.dumps({}), json.dumps({})

        soup = bs(rq, 'xml')
        detalle = soup.find('Detalle')
        detalle = json.dumps(xmltodict.parse(str(detalle)))
        dictionary = {}
        for key in self.datos_adicionales:
            try:
                value = soup.find(key)
                if value is None:
                    value = soup.find(text=re.compile(key)).parent.parent.find('ValorDA').get_text()
                else:
                    value = value.get_text()
                dictionary[key] = value
            except Exception:
                continue
        dictionary = json.dumps(dictionary)
        return dictionary, detalle


    def pool_only(self, df):
        # Split the dataframe into 8 chunks and map them over 8 worker processes.
        df_split = np.array_split(df, 8)
        pool = Pool(8)
        df = pd.concat(pool.map(self.aux_tupla, df_split))
        pool.close()
        pool.join()
        return df

    def main(self, dia, choice='pool'):
        t1 = time.time()
        df = self.getUris(dia, limit=True)
        print('FINISHED: {} get Uris in {}'.format(dia, time.time() - t1))
        if choice == 'pool':
            df = self.pool_only(df)
        elif choice == 'combined':
            self.pool(df)
            df = pd.concat(self.dfs)
            print([i.shape[0] for i in self.dfs])
        elif choice == 'thread':
            self.thread_only(df)
            df = pd.concat(self.dfs)
            print([i.shape[0] for i in self.dfs])
        else:
            df['aux_tupla'] = df['uri'].str.replace('/v01/', '/depot/').apply(lambda x: self.uriToDicts3(x))
        print('FINISHED: {} , {} rows uriToDicts3 in {} hours'.format(dia, df.shape[0], (time.time() - t1) / 3600))
        df[['data_adicional', 'detalle']] = df['aux_tupla'].apply(pd.Series)
        df.drop('aux_tupla', axis=1, inplace=True)
        # self.insert_table(df)
        return df

def parallel(dia):
    t1 = time.time()
    a = Receptores().main(dia, choice='pool')
    a.to_csv('{}.csv'.format(dia), index=False)
    # print('LISTO {} - {}'.format(dia, time.time() - t1))
    return a

if __name__ == '__main__':
    t1 = time.time()
    # df = pd.read_csv('dia_emision_batch.csv')
    # dias = [str(i) for i in df.loc[:, 'dia_emision']]
    dias = ['20180101', '20170910', '20170730']
    for i in dias:
        if os.path.exists('{}.csv'.format(i)):
            print('Already exists:', i)
            continue
        try:
            parallel(i)
        except Exception:
            print('Failed!', i)
    print('TOTAL TIME: {}'.format((time.time() - t1) / 3600))

Can you post your code so we can take a look? Hey! I added the important parts of the code; some parts are irrelevant. The pool_only method, aux_tupla and main are the parts I consider important. I have other ways of parallelizing it, but this one was faster (I also tried combining threads with a pool, without success). Also, 'dias' is much longer in reality; I shortened it for this example.
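The pool and thread_only methods that main() dispatches to for choice='combined' and choice='thread' are not shown in the post. A minimal sketch of what thread_only might look like, assuming it follows the same chunk-and-map pattern as pool_only and stores its results in self.dfs the way main() expects (the ThreadPool choice and worker count are assumptions, not the original code):

    # Hypothetical counterpart to pool_only (not part of the original post): the S3
    # fetches in uriToDicts3 are I/O-bound, so threads can overlap the downloads
    # without pickling the dataframe chunks the way Pool does.
    def thread_only(self, df, workers=16):
        from multiprocessing.pool import ThreadPool
        chunks = np.array_split(df, workers)
        with ThreadPool(workers) as tp:
            self.dfs = tp.map(self.aux_tupla, chunks)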