Result of the Python program is wrong when used simultaneously with other Python programs using cron or subprocess, but works great when run alone
python, concurrency, cron, web-scraping, beautifulsoup

I am facing a strange problem. First, I will explain what the code is for. It is a web scraping script written in Python that extracts information from a website and inserts it into my localhost database. The whole program is as follows:
#!/usr/bin/python
import urllib
from bs4 import BeautifulSoup
import MySQLdb
import re
import pdb
import sys
def getting_urls_of_all_pages():
    i = 1
    while i <= 40:  # 40 is the total number of main pages
        url_rent_flat = 'http://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/?page=' + str(i)  # url of the main page (iterating to 40)
        link = []
        htmlfile = urllib.urlopen(url_rent_flat).read()
        soup = BeautifulSoup(htmlfile)
        link = soup.find_all('a', xtclib=re.compile("listing_list_\d+_title_link"), href=True)  # stores all the (25) links of the page
        """
        Part 2: passing each property url to process for data extraction
        """
        for a in link:
            every_property_in_a_page_data_extraction(a['href'])
        i += 1
def every_property_in_a_page_data_extraction(url):
    # global count_prop
    # count_curr=""
    # date_result=""
    title_result = ""
    price_result = ""
    bedroom_result = ""
    agencyfee_result = ""
    bathroom_result = ""
    size_result = ""
    propertyref_result = ""
    furnished_result = ""
    rent_is_paid_result = ""
    building_result = ""
    Amenities_result = ""
    tradename_result = ""
    licencenum_result = ""
    reraid_result = ""
    phone_result = ""
    link_result = ""
    Zoned_for_result = ""
    Freehold_result = ""
    Pricepersq_result = ""
    Type_result = "Apartment"
    Sale_Rent_result = "Rent"
    rows = 0
"""
Part1: Extracting data
"""
htmlfile=urllib.urlopen(url).read()
soup=BeautifulSoup(htmlfile)
"""
Part2: Extracting the components
"""
#date
# try:
# date=soup.find("h3","listing-details-header")
# date_result= str(date.get_text().encode("utf-8").strip()[20:])
# print "\nPublished date: ", date_result
# except StandardError as e:
# phone_result="Error was {0}".format(e)
# print phone_result
# Sale/Rent
print "Sale/Rent: ", Sale_Rent_result
# Type of property
print "Type of property: ", Type_result
#title
try:
title= soup.find('span',{'id':'listing-title-wrap'})
title_result= str(title.get_text().strip().encode("utf-8"))
print "Title: ",title_result
except StandardError as e:
title_result="Error was {0}".format(e)
print title_result
#price
try:
price = soup.find('span',{'id':'actualprice'})
price_result= str(price.get_text())
print "Price: ",price_result
except StandardError as e:
price_result="Error was {0}".format(e)
print price_result
#Agency Fee, Bedroom, Bathroom, Size
spans_ABBS= []
for a in soup.select(".important-fields li span"):
spans_ABBS.append(a.text.strip())
strongs_ABBS=[]
for a in soup.select(".important-fields li strong"):
strongs_ABBS.append(a.text.strip())
for name, value in zip(spans_ABBS, strongs_ABBS):
if name=="Agency Fees:":
try:
agencyfee_result= str(value)
print "Agency Fees: ", agencyfee_result
except StandardError as e:
agencyfee_result="Error was {0}".format(e)
print agencyfee_result
elif name=="Bedrooms:":
try:
bedroom_result= str(value)
print "Number of Bedrooms: ",bedroom_result
except StandardError as e:
bedroom_result="Error was {0}".format(e)
print bedroom_result
elif name=="Bathrooms:":
try:
bathroom_result= str(value)
print "Number of Bathrooms: ", bathroom_result
except StandardError as e:
bathroom_result="Error was {0}".format(e)
print bathroom_result
elif name=="Size:":
try:
size_result= str(value)
print "Size of the property: ",size_result
except StandardError as e:
size_result="Error was {0}".format(e)
print size_result
elif name=="Zoned For:":
try:
Zoned_for_result= str(value)
print "Zoned For:",Zoned_for_result
except StandardError as e:
Zoned_for_result="Error was {0}".format(e)
print Zoned_for_result
elif name=="Freehold:":
try:
Freehold_result= str(value)
print "Freehold: ",Freehold_result
except StandardError as e:
Freehold_result="Error was {0}".format(e)
print Freehold_result
elif name=="Price / SqFt:":
try:
Pricepersq_result= str(value)
print "Price Per Sqft: ",Pricepersq_result
except StandardError as e:
Pricepersq_result="Error was {0}".format(e)
print Pricepersq_result
    # Property Reference, Furnished, Listed By, Rent Is Paid, Building, Amenities
    spans_others = []
    for a in soup.select("#listing-details-list li span"):
        spans_others.append(a.text.strip())
    strongs_others = []
    for a in soup.select("#listing-details-list li strong"):
        strongs_others.append(a.text.strip())
    for name, value in zip(spans_others, strongs_others):
        if name == "Listed by:":
            break
        elif name == "Property Reference:":
            try:
                propertyref_result = str(value.strip())
                print "Property reference in Dubizzle: ", propertyref_result
            except StandardError as e:
                propertyref_result = "Error was {0}".format(e)
                print propertyref_result
        elif name == "Furnished:":
            try:
                furnished_result = str(value.strip())
                print "Furnished status: ", furnished_result
            except StandardError as e:
                furnished_result = "Error was {0}".format(e)
                print furnished_result
        elif name == "Rent Is Paid:":
            try:
                rent_is_paid_result = str(value.strip())
                print "Rent payment: ", rent_is_paid_result
            except StandardError as e:
                rent_is_paid_result = "Error was {0}".format(e)
                print rent_is_paid_result
        elif name == "Building:":
            try:
                building_result = str(value.strip())
                print "Building info: ", building_result
            except StandardError as e:
                building_result = "Error was {0}".format(e)
                print building_result
        elif name == "Amenities:":
            try:
                for a in value.split(","):
                    Amenities_result += a.strip() + ","
                print Amenities_result
            except StandardError as e:
                Amenities_result = "Error was {0}".format(e)
                print Amenities_result
    # Agents info --> Trade Name, DED Licence Number, RERA Registration Number
    spans_broker = []
    for a in soup.select("#broker-details li span"):
        spans_broker.append(a.text.strip())
    strongs_broker = []
    for a in soup.select("#broker-details li strong"):
        strongs_broker.append(a.text.strip())
    for name, value in zip(spans_broker, strongs_broker):
        if name == "Trade Name:":
            try:
                tradename_result = str(value.strip())
                print "Trade name: ", tradename_result
            except StandardError as e:
                tradename_result = "Error was {0}".format(e)
                print tradename_result
        elif name == "DED Licence Number:":
            try:
                licencenum_result = str(value.strip())
                print "Licence #: ", licencenum_result
            except StandardError as e:
                licencenum_result = "Error was {0}".format(e)
                print licencenum_result
        elif name == "RERA Registration Number:":
            try:
                reraid_result = str(value.strip())
                print "RERA ID #: ", reraid_result
            except StandardError as e:
                reraid_result = "Error was {0}".format(e)
                print reraid_result
    # phone num
    try:
        phone = soup.find_all("div", "phone-content")
        for a in phone:
            phone_result = str(a.get_text().strip().encode("utf-8"))
            print "Phone information:", phone_result
    except StandardError as e:
        phone_result = "Error was {0}".format(e)
        print phone_result
    # link
    try:
        link = soup.find('input', {'id': 'short-link-input'})
        link_result = str(link.get('value'))
        print "Short Reference link: ", link_result
    except StandardError as e:
        link_result = "Error was {0}".format(e)
        print link_result
    # double check of the types before conversion
    # print map(type, (date_result, Sale_Rent_result, Type_result, title_result, price_result, Pricepersq_result, bedroom_result, agencyfee_result, bathroom_result, size_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result, licencenum_result, reraid_result, phone_result, link_result))
    # count_prop += 1
    """
    Connecting to Database and putting data into it
    """
    db = MySQLdb.connect("localhost", "root", "ahmed", "practice")
    cursor = db.cursor()
    # checking phase to stop scraping
    sql = """SELECT Short_link FROM Properties WHERE Short_link=%s"""
    print rows
    rows = cursor.execute(sql, (link_result,))
    print rows
    if rows >= 1:
        print "Already present - The program is terminating"
        sys.exit()
    else:
        query = """INSERT INTO Properties (Sale_Rent, Type, Title, Price, PricePerSqrFt, Bedroom, Agency_Fee, Bathroom, Size, ZonedFor, Freehold, Prop_ref, Furnished_status, Rent_payment, Building_info, Amenities, Trade_name, Licence, RERA_ID, Phone_info, Short_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        cursor.execute(query, (Sale_Rent_result, Type_result, title_result, price_result, Pricepersq_result, bedroom_result, agencyfee_result, bathroom_result, size_result, Zoned_for_result, Freehold_result, propertyref_result, furnished_result, rent_is_paid_result, building_result, Amenities_result, tradename_result, licencenum_result, reraid_result, phone_result, link_result))
        db.commit()
    cursor.close()
    db.close()
#-----------------------------------------------------------
# count_prop=1
getting_urls_of_all_pages()
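
A side note on the duplicate check at the end of every_property_in_a_page_data_extraction: with several processes writing to the same table, a SELECT followed by a separate INSERT is not atomic, so two scripts can interleave between the two statements. A minimal sketch of letting MySQL enforce uniqueness instead (this assumes a one-time UNIQUE index on Short_link; the real INSERT would of course carry all 21 columns, not just the key):

import MySQLdb

def insert_if_new(link_result):
    # Assumes this one-time schema change was made beforehand:
    #   ALTER TABLE Properties ADD UNIQUE (Short_link);
    # INSERT IGNORE then makes duplicate handling atomic inside MySQL.
    db = MySQLdb.connect("localhost", "root", "ahmed", "practice")
    cursor = db.cursor()
    inserted = cursor.execute(
        "INSERT IGNORE INTO Properties (Short_link) VALUES (%s)",
        (link_result,))
    db.commit()
    cursor.close()
    db.close()
    return inserted == 1  # False means the row already existed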
Now, I have 3 different Python programs that do the same extraction from 3 different web pages. I have to run these 3 programs, such as AppFlatForRent.py, AppForSale.py, and CommercialForSale.py, at the same time.

Problem

When I run these 3 programs individually they work great, but when I try to run them using cron or Python's subprocess.Popen method, one of the 3 programs gets 1 back in the rows variable even though the database is empty, and therefore terminates right at the start.

I do not know why the rows variable becomes 1 and one of the 3 programs gets skipped whenever I try to run them simultaneously.
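
Before anything else, it may be worth logging what the duplicate check actually matched before sys.exit() fires. One possibility worth ruling out: if a page fetch fails under concurrent load, link_result holds an error string such as "Error was ..." rather than a link, so two scripts that fail the same way would match each other's rows. A sketch of the same check with logging added (check_duplicate is a hypothetical helper, not part of the original scripts):

import sys

def check_duplicate(cursor, link_result):
    # Same SELECT as in the scripts, but it reports which script saw a
    # "duplicate" and for which link value before terminating.
    sql = "SELECT Short_link FROM Properties WHERE Short_link=%s"
    rows = cursor.execute(sql, (link_result,))
    if rows >= 1:
        print "%s: duplicate for %r (rows=%d)" % (sys.argv[0], link_result, rows)
        sys.exit()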
*** The concurrency methods I am trying ***

This runs the 13 programs simultaneously, and AppFlatForRent.py gets skipped because 1 is returned in the rows variable and the program terminates:
import subprocess
import os

scripts_to_run = ['CommercialForSale.py','AppFlatForRent.py','AppForSale.py','MultipleUnitsForSale.py','RentalWanted.py','RentCommercial.py','RoomsForRent.py','RoomsWanted.py','ShortTermDaily.py','VillaHouseForRent.py', 'LandForSale.py','ShortTermMonthly.py','VillaHouseForSale.py']

for s in scripts_to_run:
    subprocess.Popen(["python", os.path.join(os.getcwd(), s)])
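
To see why one child exits early, the Popen calls can also be pointed at per-script log files; subprocess.Popen accepts ordinary file objects for stdout and stderr. A sketch (the .log filenames are just an illustration):

import os
import subprocess

scripts_to_run = ['CommercialForSale.py', 'AppFlatForRent.py', 'AppForSale.py']

procs = []
for s in scripts_to_run:
    log = open(s + '.log', 'w')  # one log file per script
    p = subprocess.Popen(["python", os.path.join(os.getcwd(), s)],
                         stdout=log, stderr=subprocess.STDOUT)
    procs.append((p, log))

for p, log in procs:
    p.wait()   # let every script finish before the launcher exits
    log.close()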
Crontab - after the rows variable comes back as 1 and the program terminates, it likewise skips one program, AppFlatForRent.py:
32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppFlatForRent.py
32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppForSale.py
32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/CommercialForSale$
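
If contention from all the jobs starting at the same second is a suspect, the same entries can be staggered by a couple of minutes as a diagnostic (same commands, only the minute field changes):

32 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppFlatForRent.py
34 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/AppForSale.py
36 21 * * * /usr/bin/python /home/ahmed/Desktop/scrap/CommercialForSale.py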
Both of the above methods produce this error by terminating one of the programs, yet when I run the program that was terminated during the simultaneous run on its own, it works fine.

Note:

I think there is something wrong with the way I am inserting the data.

All the Python programs are exactly the same; only the web page links used for the extraction differ.

Please help. I have been trying to solve this for the past 24 hours! Help!

I suggest using multiprocessing to run the scripts simultaneously. You could even simplify the whole setup by not packing the scripts into different programs and just changing a simple variable, like this:
import multiprocessing as mp

homepage_lis = ['google', 'facebook', 'twitter']  # the per-site variable

def data_getter(homepage):
    do_something(homepage)
    return result_from_homepage

def data_saver(result):
    do_something_with(result)

def apply_async_with_callback():
    pool = mp.Pool(processes=4)
    for i in range(len(homepage_lis)):
        pool.apply_async(data_getter, args=(homepage_lis[i],), callback=data_saver)
    pool.close()
    pool.join()

if __name__ == '__main__':
    apply_async_with_callback()
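
A usage note on this sketch: the pool size does not have to be hard-coded, since mp.cpu_count() reports the available cores/threads. Also, because the apply_async callback runs in the parent process, keeping the MySQL insert inside data_saver would mean only one process ever writes to the database, which sidesteps the concurrent check-then-insert race:

import multiprocessing as mp

pool = mp.Pool(processes=mp.cpu_count())  # one worker per core/thread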
Adjust the number of processes according to the available cores/threads.

Comments:

- How about running all the scripts in dedicated processes with multiprocessing, and scheduling that multiprocessing script through a cron job?
- There is no error in the cronjob statements above. What evidence is there that VillaHouseForSale.py is not running?
- Please look at the edited question. As mentioned above, it produces the error when it runs simultaneously.
- To make things simpler: could you produce a minimal example that shows the same behaviour?