Result of the Python program is wrong when used simultaneously with other Python programs using cron or subprocess, but works great when run alone
python, concurrency, cron, web-scraping, beautifulsoup

I am facing a strange problem. First, I will explain what the code is for. It is a web scraping script written in Python that extracts information from a website and inserts it into my localhost database. The whole program is as follows:
#!/usr/bin/python
import urllib
from bs4 import BeautifulSoup
import MySQLdb
import re
import pdb
import sys
def getting_urls_of_all_pages():
    i = 1
    while i <= 40:  # 40 is the total number of main pages
        url_rent_flat = 'http://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/?page=' + str(i)  # url of the main page (iterating to 40)
        link = []
        htmlfile = urllib.urlopen(url_rent_flat).read()
        soup = BeautifulSoup(htmlfile)
        link = soup.find_all('a', xtclib=re.compile("listing_list_\d+_title_link"), href=True)  # stores all the (25) links of the page
        """
        Part 2: passing each property url to process for data extraction
        """
        for a in link:
            every_property_in_a_page_data_extraction(a['href'])
        i += 1
def every_property_in_a_page_data_extraction(url):
    # global count_prop
    # count_curr=""
    # date_result=""
    title_result = ""
    price_result = ""
    bedroom_result = ""
    agencyfee_result = ""
    bathroom_result = ""
    size_result = ""
    propertyref_result = ""
    furnished_result = ""
    rent_is_paid_result = ""
    building_result = ""
    Amenities_result = ""
    tradename_result = ""
    licencenum_result = ""
    reraid_result = ""
    phone_result = ""
    link_result = ""
    Zoned_for_result = ""
    Freehold_result = ""
    Pricepersq_result = ""
    Type_result = "Apartment"
    Sale_Rent_result = "Rent"
    rows = 0
"""
Part1: Extracting data
"""
htmlfile=urllib.urlopen(url).read()
soup=BeautifulSoup(htmlfile)
"""
Part2: Extracting the components
"""
#date
# try:
# date=soup.find("h3","listing-details-header")
# date_result= str(date.get_text().encode("utf-8").strip()[20:])
# print "\nPublished date: ", date_result
# except StandardError as e:
# phone_result="Error was {0}".format(e)
# print phone_result
# Sale/Rent
print "Sale/Rent: ", Sale_Rent_result
# Type of property
print "Type of property: ", Type_result
#title
try:
title= soup.find('span',{'id':'listing-title-wrap'})
title_result= str(title.get_text().strip().encode("utf-8"))
print "Title: ",title_result
except StandardError as e:
title_result="Error was {0}".format(e)
print title_result
#price
try:
price = soup.find('span',{'id':'actualprice'})
price_result= str(price.get_text())
print "Price: ",price_result
except StandardError as e:
price_result="Error was {0}".format(e)
print price_result
#Agency Fee, Bedroom, Bathroom, Size
spans_ABBS= []
for a in soup.select(".important-fields li span"):
spans_ABBS.append(a.text.strip())
strongs_ABBS=[]
for a in soup.select(".important-fields li strong"):
strongs_ABBS.append(a.text.strip())
for name, value in zip(spans_ABBS, strongs_ABBS):
if name=="Agency Fees:":
try:
agencyfee_result= str(value)
print "Agency Fees: ", agencyfee_result
except StandardError as e:
agencyfee_result="Error was {0}".format(e)
print agencyfee_result
elif name=="Bedrooms:":
try:
bedroom_result= str(value)
print "Number of Bedrooms: ",bedroom_result
except StandardError as e:
bedroom_result="Error was {0}".format(e)
print bedroom_result
elif name=="Bathrooms:":
try:
bathroom_result= str(value)
print "Number of Bathrooms: ", bathroom_result
except StandardError as e:
bathroom_result="Error was {0}".format(e)
print bathroom_result
elif name=="Size:":
try:
size_result= str(value)
print "Size of the property: ",size_result
except StandardError as e:
size_result="Error was {0}".format(e)
print size_result
elif name=="Zoned For:":
try:
Zoned_for_result= str(value)
print "Zoned For:",Zoned_for_result
except StandardError as e:
Zoned_for_result="Error was {0}".format(e)
print Zoned_for_result
elif name=="Freehold:":
try:
Freehold_result= str(value)
print "Freehold: ",Freehold_result
except StandardError as e:
Freehold_result="Error was {0}".format(e)
print Freehold_result
elif name=="Price / SqFt:":
try:
Pricepersq_result= str(value)
print "Price Per Sqft: ",Pricepersq_result
except StandardError as e:
Pricepersq_result="Error was {0}".format(e)
print Pricepersq_result
    # Property Reference, Furnished, Listed By, Rent Is Paid, Building, Amenities
    spans_others = []
    for a in soup.select("#listing-details-list li span"):
        spans_others.append(a.text.strip())
    strongs_others = []
    for a in soup.select("#listing-details-list li strong"):
        strongs_others.append(a.text.strip())
    for name, value in zip(spans_others, strongs_others):
        if name == "Listed by:":
            break
        elif name == "Property Reference:":
            try:
                propertyref_result = str(value.strip())
                print "Property reference in Dubizzle: ", propertyref_result
            except StandardError as e:
                propertyref_result = "Error was {0}".format(e)
                print propertyref_result
        elif name == "Furnished:":
            try:
                furnished_result = str(value.strip())
                print "Furnished status: ", furnished_result
            except StandardError as e:
                furnished_result = "Error was {0}".format(e)
                print furnished_result
        elif name == "Rent Is Paid:":
            try:
                rent_is_paid_result = str(value.strip())
                print "Rent payment: ", rent_is_paid_result
            except StandardError as e:
                rent_is_paid_result = "Error was {0}".format(e)
                print rent_is_paid_result
        elif name == "Building:":
            try:
                building_result = str(value.strip())
                print "Building info: ", building_result
            except StandardError as e:
                building_result = "Error was {0}".format(e)
                print building_result
        elif name == "Amenities:":
            try:
                for a in value.split(","):
                    Amenities_result += a.strip() + ","
                print Amenities_result
            except StandardError as e:
                Amenities_result = "Error was {0}".format(e)
                print Amenities_result
    # Agents info --> Trade Name, DED Licence Number, RERA Registration Number
    spans_broker = []
    for a in soup.select("#broker-details li span"):
        spans_broker.append(a.text.strip())
    strongs_broker = []
    for a in soup.select("#broker-details li strong"):
        strongs_broker.append(a.text.strip())
    for name, value in zip(spans_broker, strongs_broker):
        if name == "Trade Name:":
            try:
                tradename_result = str(value.strip())
                print "Trade name: ", tradename_result
            except StandardError as e:
                tradename_result = "Error was {0}".format(e)
                print tradename_result
        elif name == "DED Licence Number:":
            try:
                licencenum_result = str(value.strip())
                print "Licence #: ", licencenum_result
            except StandardError as e:
                licencenum_result = "Error was {0}".format(e)
                print licencenum_result
        elif name == "RERA Registration Number:":
            try:
                reraid_result = str(value.strip())
                print "RERA ID #: ", reraid_result
            except StandardError as e:
                reraid_result = "Error was {0}".format(e)
                print reraid_result
    # phone num
    try:
        phone = soup.find_all("div", "phone-content")
        for a in phone:
            phone_result = str(a.get_text().strip().encode("utf-8"))
            print "Phone information:", phone_result
    except StandardError as e:
        phone_result = "Error was {0}".format(e)
        print phone_result
    # link
    try:
        link = soup.find('input', {'id': 'short-link-input'})
        link_result = str(link.get('value'))
        print "Short Reference link: ", link_result
    except StandardError as e:
        link_result = "Error was {0}".format(e)
        print link_result
    # double check of the types before conversion
    # print map(type, (date_result, Sale_Rent_result, Type_result, title_result, price_result, Pricepersq_result, bedroom_result, agencyfee_result, bathroom_result, size_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result, licencenum_result, reraid_result, phone_result, link_result))
    # count_prop += 1
    """
    Connecting to Database and putting data into it
    """
    db = MySQLdb.connect("localhost", "root", "ahmed", "practice")
    cursor = db.cursor()
    # checking phase to stop scraping
    sql = """SELECT Short_link FROM Properties WHERE Short_link=%s"""
    print rows
    rows = cursor.execute(sql, (link_result,))
    print rows
    if rows >= 1:
        print "Already present - The program is terminating"
        sys.exit()
    else:
        query = """INSERT INTO Properties (Sale_Rent, Type, Title, Price, PricePerSqrFt, Bedroom, Agency_Fee, Bathroom, Size, ZonedFor, Freehold, Prop_ref, Furnished_status, Rent_payment, Building_info, Amenities, Trade_name, Licence, RERA_ID, Phone_info, Short_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        cursor.execute(query, (Sale_Rent_result, Type_result, title_result, price_result, Pricepersq_result, bedroom_result, agencyfee_result, bathroom_result, size_result, Zoned_for_result, Freehold_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result, licencenum_result, reraid_result, phone_result, link_result))
        db.commit()
    cursor.close()
    db.close()
#-----------------------------------------------------------
# count_prop=1
getting_urls_of_all_pages()
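
A side note on the duplicate check at the end of every_property_in_a_page_data_extraction: with several processes writing to the same table, a SELECT followed by a separate INSERT is not atomic, so two scripts can interleave between the two statements. A minimal sketch of letting MySQL enforce uniqueness instead (this assumes a one-time UNIQUE index on Short_link; the real INSERT would of course carry all 21 columns, not just the key):

import MySQLdb

def insert_if_new(link_result):
    # Assumes this one-time schema change was made beforehand:
    #   ALTER TABLE Properties ADD UNIQUE (Short_link);
    # INSERT IGNORE then makes duplicate handling atomic inside MySQL.
    db = MySQLdb.connect("localhost", "root", "ahmed", "practice")
    cursor = db.cursor()
    inserted = cursor.execute(
        "INSERT IGNORE INTO Properties (Short_link) VALUES (%s)",
        (link_result,))
    db.commit()
    cursor.close()
    db.close()
    return inserted == 1  # False means the row already existed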
Now, I have 3 different Python programs that do the same extraction from 3 different web pages. I have to run these 3 programs, such as AppFlatForRent.py, AppForSale.py, and CommercialForSale.py, at the same time.

Problem

When I run these 3 programs individually they work great, but when I try to run them using cron or Python's subprocess.Popen method, one of the 3 programs gets 1 back in the rows variable even though the database is empty, and therefore terminates right at the start.

I do not know why the rows variable becomes 1 and one of the 3 programs gets skipped whenever I try to run them simultaneously.
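
Before anything else, it may be worth logging what the duplicate check actually matched before sys.exit() fires. One possibility worth ruling out: if a page fetch fails under concurrent load, link_result holds an error string such as "Error was ..." rather than a link, so two scripts that fail the same way would match each other's rows. A sketch of the same check with logging added (check_duplicate is a hypothetical helper, not part of the original scripts):

import sys

def check_duplicate(cursor, link_result):
    # Same SELECT as in the scripts, but it reports which script saw a
    # "duplicate" and for which link value before terminating.
    sql = "SELECT Short_link FROM Properties WHERE Short_link=%s"
    rows = cursor.execute(sql, (link_result,))
    if rows >= 1:
        print "%s: duplicate for %r (rows=%d)" % (sys.argv[0], link_result, rows)
        sys.exit()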
*** The concurrency methods I am trying ***

This runs the 13 programs simultaneously, and AppFlatForRent.py gets skipped because 1 is returned in the rows variable and the program terminates:
import subprocess
import os

scripts_to_run = ['CommercialForSale.py','AppFlatForRent.py','AppForSale.py','MultipleUnitsForSale.py','RentalWanted.py','RentCommercial.py','RoomsForRent.py','RoomsWanted.py','ShortTermDaily.py','VillaHouseForRent.py', 'LandForSale.py','ShortTermMonthly.py','VillaHouseForSale.py']

for s in scripts_to_run:
    subprocess.Popen(["python", os.path.join(os.getcwd(), s)])
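
To see why one child exits early, the Popen calls can also be pointed at per-script log files; subprocess.Popen accepts ordinary file objects for stdout and stderr. A sketch (the .log filenames are just an illustration):

import os
import subprocess

scripts_to_run = ['CommercialForSale.py', 'AppFlatForRent.py', 'AppForSale.py']

procs = []
for s in scripts_to_run:
    log = open(s + '.log', 'w')  # one log file per script
    p = subprocess.Popen(["python", os.path.join(os.getcwd(), s)],
                         stdout=log, stderr=subprocess.STDOUT)
    procs.append((p, log))

for p, log in procs:
    p.wait()   # let every script finish before the launcher exits
    log.close()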
Crontab - after the rows variable comes back as 1 and the program terminates, it likewise skips one program, AppFlatForRent.py:
32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppFlatForRent.py
32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppForSale.py
32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/CommercialForSale$
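
If contention from all the jobs starting at the same second is a suspect, the same entries can be staggered by a couple of minutes as a diagnostic (same commands, only the minute field changes):

32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppFlatForRent.py
34 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppForSale.py
36 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/CommercialForSale.py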
Both of the above methods produce this error by terminating one of the programs, yet when I run the program that was terminated during the simultaneous run on its own, it works fine.

Note:

I think there is something wrong with the way I am inserting the data.

All the Python programs are exactly the same; only the web page links used for the extraction differ.

Please help. I have been trying to solve this for the past 24 hours! Help!

I suggest using multiprocessing to run the scripts simultaneously. You could even simplify the whole setup by not packing the scripts into different programs and just changing a simple variable, like this:
import multiprocessing as mp

homepage_lis = ['google', 'facebook', 'twitter']  # the per-site variable

def data_getter(homepage):
    do_something(homepage)
    return result_from_homepage

def data_saver(result):
    do_something_with(result)

def apply_async_with_callback():
    pool = mp.Pool(processes=4)
    for i in range(len(homepage_lis)):
        pool.apply_async(data_getter, args=(homepage_lis[i],), callback=data_saver)
    pool.close()
    pool.join()

if __name__ == '__main__':
    apply_async_with_callback()
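
A usage note on this sketch: the pool size does not have to be hard-coded, since mp.cpu_count() reports the available cores/threads. Also, because the apply_async callback runs in the parent process, keeping the MySQL insert inside data_saver would mean only one process ever writes to the database, which sidesteps the concurrent check-then-insert race:

import multiprocessing as mp

pool = mp.Pool(processes=mp.cpu_count())  # one worker per core/thread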
Adjust the number of processes according to the available cores/threads.

Comments:

- How about running all the scripts in dedicated processes with multiprocessing, and scheduling that multiprocessing script through a cron job?
- There is no error in the cronjob statements above. What evidence is there that VillaHouseForSale.py is not running?
- Please look at the edited question. As mentioned above, it produces the error when it runs simultaneously.
- To make things simpler: could you produce a minimal example that shows the same behaviour?