Python pandas:忽略 lambda 中的异常
我有一个由 PDF URL 组成的 DataFrame。Python pandas:忽略 lambda 中的异常,python,pandas,exception,lambda,Python,Pandas,Exception,Lambda,我有一个由 PDF URL 组成的 DataFrame: Source 0 http://www.ampire.com.tw/en/download.asp?fileN... 1 http://www.ampire.com.tw/en/download.asp?fileN... 2 http://www.buckeyeshapeform.com/media/1240/iso... 3 http://www.ioni
Source
0 http://www.ampire.com.tw/en/download.asp?fileN...
1 http://www.ampire.com.tw/en/download.asp?fileN...
2 http://www.buckeyeshapeform.com/media/1240/iso...
3 http://www.ionix-systems.com/files/EN91002009I...
4 http://php2.twinner.com.tw/files/chiplus/ISO90...
我创建了一个函数,将这些URL转换为图像,然后对它们进行OCR并返回特定行,我希望该行填充一个新列
我就是这么做的:
import cv2
import requests
import pdf2image
import pytesseract
import dateutil.parser as dparser
import pandas as pd
import numpy as np
def address(x, timeout=30):
    """Return the certificate line OCR'd from the first page of the PDF at ``x``.

    Downloads the PDF, renders its first page to ``sora.png``, writes the HSV
    value channel of that image to ``temp.png``, runs Tesseract on it, and
    returns the first OCR line that contains ``':20'`` or ``': 20'`` together
    with one of ``'iso'``, ``'iatf'`` or ``'1so'``.

    Args:
        x: URL of the PDF to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The matching line in title case, or ``None`` when no line matches.

    Raises:
        Whatever ``requests``, ``pdf2image``, OpenCV or ``pytesseract`` raise
        on failure; this version does no error handling of its own.
    """
    response = requests.get(x, timeout=timeout)
    # ``.content`` undoes gzip/deflate content-encoding; ``.raw.read()``
    # would hand the still-compressed bytes to the PDF renderer.
    images = pdf2image.convert_from_bytes(response.content)

    # Fixed file names: both files are overwritten on every call.
    sora = 'sora.png'
    images[0].save(sora, 'PNG')
    img = cv2.imread(sora, cv2.IMREAD_COLOR)

    # HSV = hue, saturation, value; only the value plane is passed to OCR.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    _, _, v = cv2.split(hsv)
    cv2.imwrite('temp.png', v)

    p = pytesseract.image_to_string(cv2.imread('temp.png'))
    for line in p.lower().split('\n'):
        has_marker = ':20' in line or ': 20' in line
        # '1so' is presumably a common OCR misreading of 'iso'.
        has_standard = 'iso' in line or 'iatf' in line or '1so' in line
        if has_marker and has_standard:
            return line.title()
    return None
我想用 lambda 来运行它:如果 URL 已失效、不包含 PDF 或出现任何类似的错误,就只在 “B” 列中填入 “Down”,如下所示
# NOTE: `Exception` here is the exception class object itself, which is always
# truthy, so this expression yields "Down" for every row and never calls
# address(x). A conditional expression cannot catch exceptions; the handling
# has to live inside the function being applied.
df['B'] = df['Source'].apply(lambda x: "Down" if Exception else address(x))
正确的方法是什么?
我会检查 `requests.get()` 返回的响应,如果响应不符合预期,则返回 "Down"。它可能看起来像:
import cv2
import requests
import pdf2image
import pytesseract
import dateutil.parser as dparser
import pandas as pd
import numpy as np
def address(x, timeout=30):
    """Return the certificate line OCR'd from the PDF at ``x``, or ``"Down"``.

    Downloads the PDF, renders its first page to ``sora.png``, writes the HSV
    value channel of that image to ``temp.png``, runs Tesseract on it, and
    returns the first OCR line that contains ``':20'`` or ``': 20'`` together
    with one of ``'iso'``, ``'iatf'`` or ``'1so'``.

    Args:
        x: URL of the PDF to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        ``"Down"`` when the request fails, the status code is not 200, or the
        payload cannot be rendered as a PDF; otherwise the matching line in
        title case, or ``None`` when no line matches.
    """
    try:
        response = requests.get(x, timeout=timeout)
    except requests.RequestException:
        # Connection refused, DNS failure, timeout, malformed URL, ...:
        # there is no response object to inspect in these cases.
        return "Down"

    if response.status_code != 200:  # You could be even more specific here
        return "Down"

    try:
        # ``.content`` undoes gzip/deflate content-encoding; ``.raw.read()``
        # would hand the still-compressed bytes to the PDF renderer.
        images = pdf2image.convert_from_bytes(response.content)
    except (pdf2image.exceptions.PDFPageCountError,
            pdf2image.exceptions.PDFSyntaxError):
        # The server answered 200 but the body is not a readable PDF.
        return "Down"
    if not images:
        return "Down"

    # Fixed file names: both files are overwritten on every call.
    sora = 'sora.png'
    images[0].save(sora, 'PNG')
    img = cv2.imread(sora, cv2.IMREAD_COLOR)

    # HSV = hue, saturation, value; only the value plane is passed to OCR.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    _, _, v = cv2.split(hsv)
    cv2.imwrite('temp.png', v)

    p = pytesseract.image_to_string(cv2.imread('temp.png'))
    for line in p.lower().split('\n'):
        has_marker = ':20' in line or ': 20' in line
        # '1so' is presumably a common OCR misreading of 'iso'.
        has_standard = 'iso' in line or 'iatf' in line or '1so' in line
        if has_marker and has_standard:
            return line.title()
    return None
我会检查 `requests.get()` 返回的响应,如果响应不符合预期,则返回 "Down"。它可能看起来像:
import cv2
import requests
import pdf2image
import pytesseract
import dateutil.parser as dparser
import pandas as pd
import numpy as np
def address(x, timeout=30):
    """Return the certificate line OCR'd from the PDF at ``x``, or ``"Down"``.

    Downloads the PDF, renders its first page to ``sora.png``, writes the HSV
    value channel of that image to ``temp.png``, runs Tesseract on it, and
    returns the first OCR line that contains ``':20'`` or ``': 20'`` together
    with one of ``'iso'``, ``'iatf'`` or ``'1so'``.

    Args:
        x: URL of the PDF to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        ``"Down"`` when the request fails, the status code is not 200, or the
        payload cannot be rendered as a PDF; otherwise the matching line in
        title case, or ``None`` when no line matches.
    """
    try:
        response = requests.get(x, timeout=timeout)
    except requests.RequestException:
        # Connection refused, DNS failure, timeout, malformed URL, ...:
        # there is no response object to inspect in these cases.
        return "Down"

    if response.status_code != 200:  # You could be even more specific here
        return "Down"

    try:
        # ``.content`` undoes gzip/deflate content-encoding; ``.raw.read()``
        # would hand the still-compressed bytes to the PDF renderer.
        images = pdf2image.convert_from_bytes(response.content)
    except (pdf2image.exceptions.PDFPageCountError,
            pdf2image.exceptions.PDFSyntaxError):
        # The server answered 200 but the body is not a readable PDF.
        return "Down"
    if not images:
        return "Down"

    # Fixed file names: both files are overwritten on every call.
    sora = 'sora.png'
    images[0].save(sora, 'PNG')
    img = cv2.imread(sora, cv2.IMREAD_COLOR)

    # HSV = hue, saturation, value; only the value plane is passed to OCR.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    _, _, v = cv2.split(hsv)
    cv2.imwrite('temp.png', v)

    p = pytesseract.image_to_string(cv2.imread('temp.png'))
    for line in p.lower().split('\n'):
        has_marker = ':20' in line or ': 20' in line
        # '1so' is presumably a common OCR misreading of 'iso'.
        has_standard = 'iso' in line or 'iatf' in line or '1so' in line
        if has_marker and has_standard:
            return line.title()
    return None
我之前做错了:我用一种很笨的方式添加了 try/except
def address2(x, timeout=30):
    """OCR the PDFs behind the links in ``x`` and return the first country line.

    For each link in turn: download the PDF, render its first page to
    ``sora.png``, write the HSV value channel to ``temp.png``, run Tesseract
    on it, and look through the blank-line-separated chunks of OCR text for
    one that contains a country name or alpha-3 code but neither ``'Quality'``
    nor ``'Certificat'``.

    NOTE: the function returns from inside the loop, so it produces a single
    value for the whole iterable: the first match found, or ``"Down"`` at the
    first link that fails. If ``x`` is a single URL string, the loop iterates
    over its characters.

    NOTE(review): ``pycountry`` is not among the imports visible in this file;
    if it is not imported elsewhere, the resulting ``NameError`` is caught
    below and every call returns ``"Down"``. Confirm it is imported.

    Args:
        x: Iterable of PDF URLs.
        timeout: Seconds to wait for each server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The first matching chunk in title case, ``"Down"`` if processing a
        link raises, or ``None`` if no link yields a match.
    """
    for link in x:
        try:
            response = requests.get(link, timeout=timeout)
            # ``.content`` undoes gzip/deflate content-encoding;
            # ``.raw.read()`` would pass on the still-compressed bytes.
            images = pdf2image.convert_from_bytes(response.content)

            # Fixed file names: both files are overwritten on every pass.
            sora = 'sora.png'
            images[0].save(sora, 'PNG')
            img = cv2.imread(sora, cv2.IMREAD_COLOR)

            # HSV = hue, saturation, value; only the value plane goes to OCR.
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            _, _, v = cv2.split(hsv)
            cv2.imwrite('temp.png', v)

            p = pytesseract.image_to_string(cv2.imread('temp.png'))
            for line in p.split('\n\n'):
                # These two exclusions do not depend on the country, so test
                # them once per chunk rather than once per country.
                if 'Quality' in line or 'Certificat' in line:
                    continue
                for country in pycountry.countries:
                    if country.name in line or country.alpha_3 in line:
                        return line.title()
        except Exception:
            # Still best-effort, but unlike a bare ``except:`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            return "Down"
    return None
但我把它改成:
import cv2
import requests
import pdf2image
import pytesseract
import dateutil.parser as dparser
import pandas as pd
import numpy as np
def address(x, timeout=30):
    """Return the certificate line OCR'd from the PDF at ``x``, or ``"Down"``.

    Downloads the PDF, renders its first page to ``sora.png``, writes the HSV
    value channel of that image to ``temp.png``, runs Tesseract on it, and
    returns the first OCR line that contains ``':20'`` or ``': 20'`` together
    with one of ``'iso'``, ``'iatf'`` or ``'1so'``.

    Args:
        x: URL of the PDF to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The matching line in title case, ``None`` when no line matches, or
        ``"Down"`` when any step raises (unreachable URL, HTTP error status,
        payload that is not a PDF, OCR failure, ...).
    """
    try:
        response = requests.get(x, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are reported as
        # "Down" instead of being fed to the PDF renderer.
        response.raise_for_status()
        # ``.content`` undoes gzip/deflate content-encoding; ``.raw.read()``
        # would hand the still-compressed bytes to the PDF renderer.
        images = pdf2image.convert_from_bytes(response.content)

        # Fixed file names: both files are overwritten on every call.
        sora = 'sora.png'
        images[0].save(sora, 'PNG')
        img = cv2.imread(sora, cv2.IMREAD_COLOR)

        # HSV = hue, saturation, value; only the value plane goes to OCR.
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        _, _, v = cv2.split(hsv)
        cv2.imwrite('temp.png', v)

        p = pytesseract.image_to_string(cv2.imread('temp.png'))
        for line in p.lower().split('\n'):
            has_marker = ':20' in line or ': 20' in line
            # '1so' is presumably a common OCR misreading of 'iso'.
            has_standard = 'iso' in line or 'iatf' in line or '1so' in line
            if has_marker and has_standard:
                return line.title()
    except Exception:
        # Deliberately broad: any failure for this URL means "Down". Unlike a
        # bare ``except:`` this lets KeyboardInterrupt and SystemExit
        # propagate, so a long ``DataFrame.apply`` can still be interrupted.
        return "Down"
    return None
现在没事了。
我之前做错了:我用一种很笨的方式添加了 try 和 except
def address2(x, timeout=30):
    """OCR the PDFs behind the links in ``x`` and return the first country line.

    For each link in turn: download the PDF, render its first page to
    ``sora.png``, write the HSV value channel to ``temp.png``, run Tesseract
    on it, and look through the blank-line-separated chunks of OCR text for
    one that contains a country name or alpha-3 code but neither ``'Quality'``
    nor ``'Certificat'``.

    NOTE: the function returns from inside the loop, so it produces a single
    value for the whole iterable: the first match found, or ``"Down"`` at the
    first link that fails. If ``x`` is a single URL string, the loop iterates
    over its characters.

    NOTE(review): ``pycountry`` is not among the imports visible in this file;
    if it is not imported elsewhere, the resulting ``NameError`` is caught
    below and every call returns ``"Down"``. Confirm it is imported.

    Args:
        x: Iterable of PDF URLs.
        timeout: Seconds to wait for each server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The first matching chunk in title case, ``"Down"`` if processing a
        link raises, or ``None`` if no link yields a match.
    """
    for link in x:
        try:
            response = requests.get(link, timeout=timeout)
            # ``.content`` undoes gzip/deflate content-encoding;
            # ``.raw.read()`` would pass on the still-compressed bytes.
            images = pdf2image.convert_from_bytes(response.content)

            # Fixed file names: both files are overwritten on every pass.
            sora = 'sora.png'
            images[0].save(sora, 'PNG')
            img = cv2.imread(sora, cv2.IMREAD_COLOR)

            # HSV = hue, saturation, value; only the value plane goes to OCR.
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            _, _, v = cv2.split(hsv)
            cv2.imwrite('temp.png', v)

            p = pytesseract.image_to_string(cv2.imread('temp.png'))
            for line in p.split('\n\n'):
                # These two exclusions do not depend on the country, so test
                # them once per chunk rather than once per country.
                if 'Quality' in line or 'Certificat' in line:
                    continue
                for country in pycountry.countries:
                    if country.name in line or country.alpha_3 in line:
                        return line.title()
        except Exception:
            # Still best-effort, but unlike a bare ``except:`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            return "Down"
    return None
但我把它改成:
import cv2
import requests
import pdf2image
import pytesseract
import dateutil.parser as dparser
import pandas as pd
import numpy as np
def address(x, timeout=30):
    """Return the certificate line OCR'd from the PDF at ``x``, or ``"Down"``.

    Downloads the PDF, renders its first page to ``sora.png``, writes the HSV
    value channel of that image to ``temp.png``, runs Tesseract on it, and
    returns the first OCR line that contains ``':20'`` or ``': 20'`` together
    with one of ``'iso'``, ``'iatf'`` or ``'1so'``.

    Args:
        x: URL of the PDF to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The matching line in title case, ``None`` when no line matches, or
        ``"Down"`` when any step raises (unreachable URL, HTTP error status,
        payload that is not a PDF, OCR failure, ...).
    """
    try:
        response = requests.get(x, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they are reported as
        # "Down" instead of being fed to the PDF renderer.
        response.raise_for_status()
        # ``.content`` undoes gzip/deflate content-encoding; ``.raw.read()``
        # would hand the still-compressed bytes to the PDF renderer.
        images = pdf2image.convert_from_bytes(response.content)

        # Fixed file names: both files are overwritten on every call.
        sora = 'sora.png'
        images[0].save(sora, 'PNG')
        img = cv2.imread(sora, cv2.IMREAD_COLOR)

        # HSV = hue, saturation, value; only the value plane goes to OCR.
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        _, _, v = cv2.split(hsv)
        cv2.imwrite('temp.png', v)

        p = pytesseract.image_to_string(cv2.imread('temp.png'))
        for line in p.lower().split('\n'):
            has_marker = ':20' in line or ': 20' in line
            # '1so' is presumably a common OCR misreading of 'iso'.
            has_standard = 'iso' in line or 'iatf' in line or '1so' in line
            if has_marker and has_standard:
                return line.title()
    except Exception:
        # Deliberately broad: any failure for this URL means "Down". Unlike a
        # bare ``except:`` this lets KeyboardInterrupt and SystemExit
        # propagate, so a long ``DataFrame.apply`` can still be interrupted.
        return "Down"
    return None
现在一切都好了。
为什么不直接从函数中返回 "Down"?看看 try/except 块。
在 `images = pdf2image.convert_from_bytes(pdf.raw.read())` 这一行出现了异常,它没有继续执行到最后。我尝试把它写成 try 的形式,但仍然没有得到任何结果,我做得不对。
请展示你尝试过的内容。
感谢你的指点:粘贴之前重读这个函数时,我才意识到自己有多蠢。
为什么不直接从你的函数中返回 "Down"
?看看 try/except 块。
在 `images = pdf2image.convert_from_bytes(pdf.raw.read())` 这一行出现了异常,它没有继续执行到最后。我尝试把它写成 try 的形式,但仍然没有得到任何结果,我做得不对。
请展示你尝试过的内容。
感谢你的指点:粘贴函数之前重读它时,我才意识到自己有多蠢。
你应该尽量避免使用这样笼统的 `except:` 块。相反,只捕获预期的异常,例如 `except TheRelevantExceptionType:`。这是因为,如果 try 块里那十多行代码中出现了任何其他故障,你可能不会注意到,因为这个非常笼统的 `except` 会把它吞掉。这可能会导致非常意外的行为。
你应该尽量避免使用这样笼统的 `except:` 块。相反,只捕获预期的异常,例如 `except TheRelevantExceptionType:`。这是因为,如果 try 块里那十多行代码中出现了任何其他故障,你可能不会注意到,因为这个非常笼统的 `except` 会把它吞掉。这可能会导致非常意外的行为。