Python MYSQL数据插入两次

Python MYSQL数据插入两次,python,mysql,Python,Mysql,当在tweet URL中找到单个散列时,脚本会将值正确地插入到MYSQLDB中。当在tweet URL中找到2个或更多散列时,将在MYSQL数据库中插入记录两次 例如,如果一条tweet有2个URL,其中提到了散列,那么在MYSQL DB中会创建4条记录 DB状态: "https://www.virustotal.com/en/file/2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0/analysis/","201

当在tweet URL中找到单个散列时,脚本会将值正确地插入到MYSQL DB中。当在tweet URL中找到2个或更多散列时,记录会被重复插入到MYSQL数据库中

例如,如果一条tweet有2个URL,其中提到了散列,那么在MYSQL DB中会创建4条记录

DB状态:

"https://www.virustotal.com/en/file/2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0/analysis/","2017-09-20 01:00:35","2819e520dea611c4dd1c3b1fd54adbd0c50963ff75d67cc7facbe2090574afc0"
"https://www.virustotal.com/en/file/8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89/analysis/","2017-09-20 01:03:35","8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89"
"https://www.virustotal.com/en/file/b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0/analysis/","2017-09-20 01:03:35","8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89"
"https://www.virustotal.com/en/file/8084880e875b4dc97ccd9f97249d4c7184f6be092679d2b272ece2890306ca89/analysis/","2017-09-20 01:03:35","b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0"
"https://www.virustotal.com/en/file/b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0/analysis/","2017-09-20 01:03:35","b5034183d4d2aca1e586b4a4bf22f32e4204c4b6d288c171d5252636c11248a0"
关于如何只在DB中插入单个条目,有什么建议吗

#! /usr/bin/python

from __future__ import print_function
import tweepy
import json
import MySQLdb
import time
import json, urllib, urllib2, argparse, hashlib, re, sys
from dateutil import parser

# Keywords handed to the streaming filter (used as ``track=WORDS`` below).
WORDS = ['virustotal']

# Twitter API credentials. The values shown here are placeholders.
CONSUMER_KEY = "XXXX"
CONSUMER_SECRET = "YYY"
ACCESS_TOKEN = "AAAA"
ACCESS_TOKEN_SECRET = "DDDDD"


# MySQL connection settings read by store_data().
HOST = "192.168.150.1"
USER = "admin"
PASSWD = "admin"
DATABASE = "twitter"


def store_data(values, insert_time, insert_hash):
    """Insert one ``tweet_url`` row per entry of ``values``.

    Every row is the tuple ``(value, insert_time, insert_hash)``, so a call
    with N values writes N rows that all share the same time and hash.

    Args:
        values: iterable of values for the first column (the caller in this
            file passes the URLs taken from a tweet).
        insert_time: value stored in the second column of every row.
        insert_hash: value stored in the third column of every row.

    Returns:
        None.
    """
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    try:
        cursor = db.cursor()
        try:
            data = []
            for value in values:
                data.append((value, insert_time, insert_hash))
            cursor.executemany("""INSERT INTO tweet_url VALUES (%s,%s,%s)""", data)
            db.commit()
        finally:
            # Release the cursor even when the insert or the commit raises.
            cursor.close()
    finally:
        # Always give the connection back; previously an exception leaked it.
        db.close()
    return

class StreamListener(tweepy.StreamListener):
    """Stream listener that extracts SHA-256-looking hashes from tweet URLs
    and hands them to store_data() for insertion into MySQL."""

    def on_connect(self):
        """Print a notice once the streaming connection is established."""
        print("We are now connected to the streaming API.")

    def on_error(self, status_code):
        """Print the error status code and return False."""
        print('An Error has occured: ' + repr(status_code))
        # NOTE(review): returning False presumably tells tweepy to stop the
        # stream - confirm against the tweepy version in use.
        return False

    def on_data(self, data):
      """Parse one raw stream message and store the URL/hash rows.

      ``data`` is decoded with json.loads; the expanded URLs are read from
      ``entities.urls``. Any exception raised along the way is printed and
      swallowed, so the method always returns None.
      """
      try:
        datajson = json.loads(data)
        web_url= datajson['entities']['urls']
        #print(web_url)
        urls=[]
        for i in web_url:
            urls.append((i['expanded_url']))
        # Each URL is wrapped in its own single-element list.
        values = [list([item]) for item in urls]
        # Stringify every wrapped URL and join them into one comma-separated string.
        list_url = ','.join([str(i) for i in values])
        # The next two statements leave the already-built string unchanged.
        extract_url=str(list_url)
        formatted_url=''.join(extract_url)
        # Every run of 64 hex characters is treated as a hash.
        sha256_hash=re.findall(r"([a-fA-F\d]{64})", formatted_url)
        hashes=''.join(sha256_hash)
        # Current local time, formatted for the database.
        insert_time=time.strftime('%Y-%m-%d %H:%M:%S')
        # Splitting the concatenated hashes back into 64-character chunks
        # yields the same list as sha256_hash.
        hash_list=re.findall(r"([a-fA-F\d]{64})", hashes)
        # store_data() is called once per hash and receives ALL values each
        # time, so N URLs and M hashes produce N * M rows (every URL paired
        # with every hash) rather than one row per URL.
        for insert_hash in hash_list:
             store_data(values, insert_time, insert_hash)
        # Prints the function object itself, not the result of a call.
        print(store_data)
        print(hashes)
        print(type(hashes))
      except Exception as e:
             print(e)



# Build the OAuth handler from the application and user credentials.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create the listener around an API object, then attach it to a stream.
api = tweepy.API(wait_on_rate_limit=True)
listener = StreamListener(api=api)
streamer = tweepy.Stream(auth=auth, listener=listener)

# Announce the tracked keywords and start filtering the stream.
print("Tracking: " + str(WORDS))
streamer.filter(track=WORDS)

您有第一个循环:

for insert_hash in hash_list:
    store_data(values, insert_time, insert_hash)
然后再次循环这些值以构建元组的数据列表:

for value in values:
    data.append((value, insert_time, insert_hash))
因此,这些值被遍历了两次:每个散列都会与所有URL组合一次,所以2个URL × 2个散列会产生4条记录


也许您可以在调用 store_data 之前,使用 zip() 或 enumerate() 将 values 与 hash_list 逐项配对:

# Pair each URL with the hash at the same position instead of combining
# every URL with every hash.
# NOTE(review): this assumes the i-th hash belongs to the i-th URL - confirm.
data = []
# When the two lists differ in length nothing is paired and nothing is stored.
if len(values) == len(hash_list):
    # ``url_hash`` rather than ``hash`` so the builtin hash() is not shadowed.
    for val, url_hash in zip(values, hash_list):
        data.append((val, insert_time, url_hash))
    store_data(data)
然后,无需在 store_data() 中再次循环,只需更改函数签名以直接传入数据列表:

def store_data(data_list):
    """Insert the prepared row tuples in ``data_list`` into ``tweet_url``.

    Sketch only: opening the connection, creating ``cursor``, committing and
    closing are elided here and must be supplied around the call below.
    """
    # connection to database
    cursor.executemany("""INSERT INTO tweet_url VALUES (%s,%s,%s)""",data_list)

与SQL相比,我的Python技能有些欠缺,但这里似乎缺少了类似 JOIN 条件的东西。在您的示例数据中,URL和散列是交叉连接的(例如,您的URL和散列看起来像 [abc,abc],[abc,def],[def,abc],[def,def])
@Moureu:使用了上述方法,效果非常好。谢谢!
@Arun:谢谢。对于这种循环问题有个小提示:在调用函数之前以及函数内部添加一些 print(或任何日志记录器)输出,有助于理解作为参数传入的值