Python 使用函数时,列表元素不会分离
我正在编写一些代码,从网站上截取段落。虽然代码有点凌乱和混乱,但我认为它仍然清晰可辨 唯一的问题是我在写的时候遇到了一个大障碍。当这些段落被写出来时,它们似乎与该页面中的段落连接在一起,我希望每个段落都作为其单独的列表元素,而不是作为主列表中较小列表的一部分 我想要的输出是与查询最相关的顶部段落。我已经有了所有的东西来检查哪些段落对查询的可靠性,但是就像我说的,当我把它们写到一个txt文件来检查时,一个页面上的所有段落似乎都在分组在一起 这是我的密码:Python 使用函数时,列表元素不会分离,python,python-3.x,list,beautifulsoup,python-requests,Python,Python 3.x,List,Beautifulsoup,Python Requests,我正在编写一些代码,从网站上截取段落。虽然代码有点凌乱和混乱,但我认为它仍然清晰可辨 唯一的问题是我在写的时候遇到了一个大障碍。当这些段落被写出来时,它们似乎与该页面中的段落连接在一起,我希望每个段落都作为其单独的列表元素,而不是作为主列表中较小列表的一部分 我想要的输出是与查询最相关的顶部段落。我已经有了所有的东西来检查哪些段落对查询的可靠性,但是就像我说的,当我把它们写到一个txt文件来检查时,一个页面上的所有段落似乎都在分组在一起 这是我的密码: #qresultsl is a list
#qresultsl is a list of links
for xa in range(0, qresultsl):
URL=ALLresults[xa].format()
URL=str(URL)
URL=URL.replace("'","")
URL=URL.replace("[","")
URL=URL.replace("]","")
pageURL=URL
try:
pr=requests.get(pageURL, headers=headers)
except:
print("Couldn't scrape ",pageURL)
continue
if pr.status_code==200:
try:
psoup=BeautifulSoup(pr.text, 'html.parser')
paragraphs=[''.join(s.findAll(text=True))for s in
psoup.findAll('p')]
presults.append(paragraphs)
except:
print("Couldn't scrape ", pageURL)
continue
else:
print("Couldn't scrape ",pageURL)
continue
# Results
print("\r")
print(len(presults)," websites scraped of ", numresults)
print(len(presults)," pages of content ready for next phase of processing.")
paraList = []
for i in presults:
#make all keywords one big list
paraList = sum(presults, [])
presults = paraList
cleanparagraphs=[]
rangenum=len(presults)
print(presults)
def cleanresults():
ct=0
for dd in range(0,rangenum):
cleaned=presults[ct]
cleaned=str(cleaned)
cleaned=cleaned.replace("/","")
cleaned=cleaned.replace("]","")
cleaned=cleaned.replace("[","")
cleaned=cleaned.replace("'","")
cleaned=cleaned.replace("\n","")
NEWITEM=cleaned
ct=ct+1
cleanparagraphs.append(NEWITEM)
cleanresults()
presults = cleanparagraphs
paragraphs = []
for z in range(len(presults)):
pagei=presults[z]
pagei=str(pagei)
pagei=pagei.replace("[","")
pagei=pagei.replace("'","")
pagei=pagei.replace("]","")
pageHtml = pagei #i wasn't "stupid".
paragraphs.append(presults[z])
'''with open('paragraphs.txt', 'r') as f:
paragraphs = ast.literal_eval(f.read())
'''
'''for i in paragraphs:
#make all paragraphs one big list
paragraphs = sum(paragraphs, [])'''
resultspara=[]
for le in paragraphs:
if le not in resultspara:
resultspara.append(le)
paragraphs=resultspara
og=len(presults)
nl=len(paragraphs)
removed=og-nl
print(removed, " duplicates removed")
lst = []
cp=0
for para in paragraphs:
lst.append(paragraphs[cp].lower())
cp=cp+1
rem=str(lst)
rem=rem.replace("\r","")
rem=rem.replace("\n","")
rem=rem.replace('"\r\n','')
final_list=rem.split("#####")
phrase1 = query
phrase2 = query2
phrase3 = query3
phrase4 = query4
paragraphs=final_list
ammntRemoved = 0
for i in paragraphs:
if len(i) < 20:
paragraphs.remove(i)
ammntRemoved = ammntRemoved + 1
print("removed " + str(ammntRemoved) + " small lines")
randomVariable = []
for i in paragraphs:
randomVariable = sum(paragraphs, [])
paragraphs = randomVariable
def getRelated(phrase):
splitWords = phrase.split() #split the sentence for proccessing
associatedWords = [splitWords] #add spitwords to associatedWords; associatedWords will be the main variable to add processed words to
finalWords = [] #created the variable that will have the final parsde and deduped list
for word in splitWords:
#get associated words for each word in the phrase
html = requests.get("https://api.wordassociations.net/associations/v1.0/json/search?apikey=8c124543-3a0d-4ac9-b6b4-cda92d7d1411&text="+ word + "&lang=en")
theJson = html.text
source = json.loads(theJson)
try:
associatedWords.append([source["response"][0]["items"][0]["item"],source["response"][0]["items"][1]["item"],source["response"][0]["items"][2]["item"],source["response"][0]["items"][3]["item"],source["response"][0]["items"][4]["item"],source["response"][0]["items"][5]["item"]])
numass=len(associatedWords)
print(numass, " associations found for ", word)
except:
print("tested word - " + word + " - had no asocciations")
for i in associatedWords:
#make all keywords one big list
finalWords = sum(associatedWords, [])
relatedKeywords = []
for word in finalWords:
#make finalwords lowercase
relatedKeywords.append(word.lower())
return finalWords
#took that out and replaced it with a for loop that does them all beforehand
phrase1 = getRelated(query)
phrase2 = getRelated(query2)
phrase3 = getRelated(query3)
phrase4 = getRelated(query4)
topic = {}
subHead1 = {}
subHead2 = {}
subHead3 = {}
def getGoodParagraphs(keywords, dictionary):
global length
for para in paragraphs:
#get the keyword frequencies in each paragraph
x = 0
for keyword in keywords:
added = para.count(keyword)
x = x + added
dictionary[para] = x
#get the lengths of used paragraphs
length = 0
length1 = 0
length2 = 0
length3 = 0
def getLen(lengthVar, dictionary):
for i in range(100):
try:
#count the number of words
lengthVar = lengthVar + len(dictionary[i].split())
except:
#break if theres no more paragraphs in said list
break
getGoodParagraphs(phrase1, topic)
getGoodParagraphs(phrase2, subHead1)
getGoodParagraphs(phrase3, subHead2)
getGoodParagraphs(phrase4, subHead3)
getLen(length, topic)
getLen(length1, subHead1)
getLen(length2, subHead2)
getLen(length3, subHead3)
#sort paragraphs least to greatest
topic = sorted(topic, key=lambda k: topic[k], reverse=True)
subHead1 = sorted(subHead1, key=lambda k: subHead1[k], reverse=True)
subHead2 = sorted(subHead2, key=lambda k: subHead2[k], reverse=True)
subHead3 = sorted(subHead3, key=lambda k: subHead3[k], reverse=True)
def appendTop10(inputList, outputList):
try:
for i in range(3):
outputList.append(inputList[i])
except:
print("> Wasnt able to append all 3 paragraphs")
finalTopic = []
finalSubHead1 = []
finalSubHead2 = []
finalSubHead3 = []
appendTop10(topic, finalTopic)
appendTop10(subHead1, finalSubHead1)
appendTop10(subHead2, finalSubHead2)
appendTop10(subHead3, finalSubHead3)
with open("article.txt", "w") as outputFile:
count=0
count2=0
count3=0
count4=0
for i in finalTopic:
filename = 'text.txt'
with open(filename, mode="w") as outfile: # also, tried mode="rb"
for s in finalTopic:
outfile.write("%s\n" % s)
for s in finalSubHead1:
outfile.write("%s\n" % s)
for s in finalSubHead2:
outfile.write("%s\n" % s)
for s in finalSubHead3:
outfile.write("%s\n" % s)
print("DONE")
#qresultsl是一个链接列表
对于范围(0,qresultsl)内的xa:
URL=ALLresults[xa]。格式()
URL=str(URL)
URL=URL。替换(“,”)
URL=URL。替换(“[”,“”)
URL=URL。替换(“]”,“”)
pageURL=URL
尝试:
pr=requests.get(pageURL,headers=headers)
除:
打印(“无法刮取”,页面URL)
持续
如果pr.status_code==200:
尝试:
psoup=BeautifulSoup(pr.text,'html.parser')
段落=[''.join(s.findAll(text=True))表示中的
psoup.findAll('p')]
预设。附加(段落)
除:
打印(“无法刮取”,页面URL)
持续
其他:
打印(“无法刮取”,页面URL)
持续
#结果
打印(“\r”)
打印(len(预设),“网站被刮去”,numresults)
打印(len(预设),“准备好下一阶段处理的内容页”)
paraList=[]
因为我在假设中:
#将所有关键字列成一个大列表
paraList=sum(假定值,[])
presults=paraList
段落=[]
rangenum=len(假定值)
打印(预设)
def cleanresults():
ct=0
对于范围内的dd(0,rangenum):
清洁=压力[ct]
已清洁=str(已清洁)
已清洁=已清洁。替换(“/”,“”)
已清理=已清理。替换(“]”,“”)
已清理=已清理。替换(“[”,“”)
已清理=已清理。替换(“,”)
已清理=已清理。替换(“\n”和“”)
NEWITEM=已清理
ct=ct+1
清除段落。追加(NEWITEM)
清理结果()
预设值=干净的段落
段落=[]
对于范围内的z(len(presults)):
pagei=压力[z]
pagei=str(pagei)
pagei=pagei.replace(“[”,”)
pagei=pagei.replace(“,”)
pagei=pagei.replace(“]”,“”)
pageHtml=pagei#我不是“愚蠢的”。
段落.附加(假定[z])
“打开('parations.txt',r')作为f:
段落=ast.literal_eval(f.read())
'''
“对于我,在以下段落中:
#把所有段落列成一个大单子
段落=总和(段落,[])“”
结果参数=[]
对于段落中的le:
如果le不在结果段落中:
结果段落附加(le)
段落=结果段落
og=len(压力)
nl=len(段落)
已删除=og nl
打印(已删除,“已删除副本”)
lst=[]
cp=0
对于段落中的段落:
lst.append(段落[cp].lower())
cp=cp+1
rem=str(lst)
rem=rem.replace(“\r”,”)
rem=rem.replace(“\n”和“”)
rem=rem.replace(''\r\n','')
最终列表=rem.split(“#####”)
短语1=查询
短语2=疑问词2
短语3=查询3
短语4=疑问词4
段落=最终列表
ammntRemoved=0
对于第i段:
如果len(i)<20:
删除第(i)款
amntremoved=amntremoved+1
打印(“删除”+str(ammntRemoved)+“小行”)
随机变量=[]
对于第i段:
随机变量=总和(段落,[])
段落=随机变量
定义相关(短语):
splitWords=phrase.split()#拆分句子以进行处理
associatedWords=[splitWords]#将spitwords添加到associatedWords;associatedWords将是添加处理过的单词的主要变量
finalWords=[]#创建了将具有最终parsde和重复数据消除列表的变量
对于拆分中的单词:
#获取短语中每个单词的关联词
html=请求。获取(“https://api.wordassociations.net/associations/v1.0/json/search?apikey=8c124543-3a0d-4ac9-b6b4-cda92d7d1411&text=“+word+”&lang=en”)
theJson=html.text
source=json.load(theJson)
尝试:
关联词。追加([source[“response”][0][“items”][0][“item”]、source[“response”][0][“items”][1][“items”]、source[“response”][0][“items”][2][“items”]、source[“response”][0][“items”][3][“items”]、source 0][“response”][0][“items”][4][items”][4][items”]、source 0][items”][5][items”])
numass=len(关联词)
打印(numass,“找到的关联”,word)
除:
打印(“测试单词-”+单词+“-无意外”)
对于我来说,用关联词:
#将所有关键字列成一个大列表
finalWords=sum(关联词,[])
relatedKeywords=[]
对于最终单词中的单词:
#使finalwords小写
relatedKeywords.append(word.lower())
返回最终单词
#将其取出并替换为一个for循环,该循环预先完成所有这些操作
短语1=getRelated(查询)
短语2=getRelated(查询2)
短语3=getRelated(查询3)
短语4=getRelated(查询4)
主题={}
分目1={}
分目2={}
分目3={}
def GetGoodPages(关键字、字典):
全局长度
对于段落中的段落:
#获取每个段落中的关键词频率
x=0
对于关键字中的关键字:
添加=段落计数(关键字)
x=x+已添加
字典[段落]=x
#获取所用段落的长度
长度=0
长度1=0
长度2=0
长度3=0
def getLen(长度变量,字典):
对于范围(100)内的i:
尝试:
#数一数字数
lengthVar=lengthVar+len(字典[i].split())
除:
#如果所述列表中没有更多段落,则中断
打破
GetGoodPages(短语1,主题)
(第2段,分目1)
(第3句,分目2)
格特古德(ph
page_name = []
for paragraph in page:
page_name.append(paragraph)
for page in page_name:
print(page)