Python应该返回多个参数_Python

Python应该返回多个参数

python

Python应该返回多个参数,python,Python,您好，我是python初学者，正在尝试执行此程序为集合文件创建反向索引： import sys import re from porterStemmer import PorterStemmer from collections import defaultdict from array import array import gc porter=PorterStemmer() class CreateIndex: def __init__(self): self.index=de

您好，我是python初学者，正在尝试执行此程序为集合文件创建反向索引：

import sys
import re
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
import gc

porter=PorterStemmer()

class CreateIndex:

def __init__(self):
    self.index=defaultdict(list)    #the inverted index


def getStopwords(self):
    '''get stopwords from the stopwords file'''
    f=open(self.stopwordsFile, 'r')
    stopwords=[line.rstrip() for line in f]
    self.sw=dict.fromkeys(stopwords)
    f.close()


def getTerms(self, line):
    '''given a stream of text, get the terms from the text'''
    line=line.lower()
    line=re.sub(r'[^a-z0-9 ]',' ',line) #put spaces instead of non-alphanumeric characters
    line=line.split()
    line=[x for x in line if x not in self.sw]  #eliminate the stopwords
    line=[ porter.stem(word, 0, len(word)-1) for word in line]
    return line


def parseCollection(self):
    ''' returns the id, title and text of the next page in the collection '''
    doc=[]
    for line in self.collFile:
        if line=='</page>\n':
            break
        doc.append(line)

    curPage=''.join(doc)
    pageid=re.search('<id>(.*?)</id>', curPage, re.DOTALL)
    pagetitle=re.search('<title>(.*?)</title>', curPage, re.DOTALL)
    pagetext=re.search('<text>(.*?)</text>', curPage, re.DOTALL)

    if pageid==None or pagetitle==None or pagetext==None:
        return {}

    d={}
    d['id']=pageid.group(1)
    d['title']=pagetitle.group(1)
    d['text']=pagetext.group(1)

    return d


def writeIndexToFile(self):
    '''write the inverted index to the file'''
    f=open(self.indexFile, 'w')
    for term in self.index.iterkeys():
        postinglist=[]
        for p in self.index[term]:
            docID=p[0]
            positions=p[1]
            postinglist.append(':'.join([str(docID) ,','.join(map(str,positions))]))
        print >> f, ''.join((term,'|',';'.join(postinglist)))

    f.close()


def getParams(self):
    '''get the parameters stopwords file, collection file, and the output index file'''
    param=sys.argv
    self.stopwordsFile=param[0]
    self.collectionFile=param[1]
    self.indexFile=param[2]


def createIndex(self):
    '''main of the program, creates the index'''
    self.getParams()
    self.collFile=open(self.collectionFile,'r')
    self.getStopwords()

    #bug in python garbage collector!
    #appending to list becomes O(N) instead of O(1) as the size grows if gc is enabled.
    gc.disable()

    pagedict={}
    pagedict=self.parseCollection()
    #main loop creating the index
    while pagedict != {}:                    
        lines='\n'.join((pagedict['title'],pagedict['text']))
        pageid=int(pagedict['id'])
        terms=self.getTerms(lines)

        #build the index for the current page
        termdictPage={}
        for position, term in enumerate(terms):
            try:
                termdictPage[term][1].append(position)
            except:
                termdictPage[term]=[pageid, array('I',[position])]

        #merge the current page index with the main index
        for termpage, postingpage in termdictPage.iteritems():
            self.index[termpage].append(postingpage)

        pagedict=self.parseCollection()


    gc.enable()

    self.writeIndexToFile()


if __name__=="__main__":
c=CreateIndex()
c.createIndex()

导入系统进口稀土从porterStemmer导入porterStemmer 从集合导入defaultdict 从数组导入数组导入gc 波特类创建索引：定义初始化（自）： self.index=defaultdict（list）#反向索引 def getStopwords（self）： ''从stopwords文件''中获取stopwords' f=打开（self.stopwordsFile'r'） stopwords=[line.rstrip（）表示f中的行] self.sw=dict.fromkeys（停止字） f、关闭（） def getTerms（self，line）： ''给定一个文本流，从文本中获取术语'' line=line.lower（） line=re.sub（r'[^a-z0-9]，''，line）#用空格代替非字母数字字符 line=line.split（） line=[x代表x，如果x不在self.sw中，则x在line中]#删除停止字 line=[porter.stem（单词，0，len（单词）-1）表示行中的单词] 回程线 def parseCollection（自）： ''返回集合中下一页的id、标题和文本'' doc=[] 对于self.coll文件中的行：如果行=='\n'：打破单据追加（行） curPage=''.join（doc） pageid=re.search（'（.*？），curPage，re.DOTALL） pagetitle=re.search（“（.*”），curPage，re.DOTALL） pagetext=re.search（“（.*？”，curPage，re.DOTALL）如果pageid==无或pagetitle==无或pagetext==无：返回{} d={} d['id']=pageid.group（1） d['title']=pagetitle.group（1） d['text']=pagetext.group（1）返回d def写入索引配置文件（自身）： ''将反向索引写入文件'' f=打开（self.indexFile，“w”）对于self.index.iterkeys（）中的术语：发布列表=[] 对于自索引[术语]中的p： docID=p[0] 职位=p[1] postinglist.append（“：”.join（[str（docID），“，”.join（map（str，positions））]））打印>>f'、.join（（术语“|”、“；”.join（发布列表））） f、关闭（） def getParams（自）： ''获取参数stopwords文件、集合文件和输出索引文件'' param=sys.argv self.stopwordsFile=param[0] self.collectionFile=param[1] self.indexFile=param[2] def createIndex（自）： “程序的主体，创建索引” self.getParams（） self.collFile=open（self.collectionFile，'r'） self.getStopwords（） #python垃圾收集器中的bug！ #如果启用了gc，则随着大小的增长，追加到列表的内容将变为O（N），而不是O（1）。 gc.disable（） pagedict={} pagedict=self.parseCollection（） #创建索引的主循环而pagedict！={}: 行='\n'.连接（（pagedict['title']，pagedict['text']）） pageid=int（pagedict['id']）） terms=self.getTerms（行） #为当前页面生成索引 termdictPage={} 对于职位，枚举中的术语（术语）：尝试： termdictPage[术语][1]。追加（职位）除： termdictPage[term]=[pageid，数组（'I'，[position]）] #将当前页面索引与主索引合并对于termpage，在termdictPage.iteritems（）中发布页面： self.index[termpage].append（postingpage） pagedict=self.parseCollection（） gc.enable（） self.writeIndexToFile（）文件如果名称=“\uuuuu main\uuuuuuuu”： c=CreateIndex（） c、 createIndex（）它说sys.argv中只有一个参数

其他参数应该如何显示

在

getParams

函数中，您可以看到您的代码请求3个参数。调用程序时：

python your_program.py
# sys.argv[0] = 'your_program.py'

它有一个参数。因此，您还需要两个：

python your_program.py arg_1 arg_2
# sys.argv[0] = 'your_program.py'
# sys.argv[1] = 'arg_1'
# sys.argv[2] = 'arg_2

使用

python my_program.py argument1 argument2调用函数根据def getParams（self）
方法，程序希望从命令行获取参数。请其他人做你的工作之前，试着理解它（并阅读sys.argv文档）。。。