Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/github/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python应该返回多个参数_Python - Fatal编程技术网

Python应该返回多个参数

Python应该返回多个参数,python,Python,您好,我是python初学者,正在尝试执行此程序为集合文件创建反向索引: import sys import re from porterStemmer import PorterStemmer from collections import defaultdict from array import array import gc porter=PorterStemmer() class CreateIndex: def __init__(self): self.index=de

您好,我是 Python 初学者,正在尝试运行下面这个程序,为文档集合文件创建倒排索引:

import sys
import re
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
import gc

# Single module-level stemmer instance; CreateIndex.getTerms calls porter.stem().
porter = PorterStemmer()

class CreateIndex:
    """Build an inverted index from a collection file and write it to disk.

    ``self.index`` maps each term to a list of postings. A posting is a
    two-item list ``[pageid, positions]`` where ``positions`` is an
    ``array('I')`` holding the offsets of the term inside that page.
    """

    def __init__(self):
        """Start with an empty inverted index."""
        self.index = defaultdict(list)    # the inverted index

    def getStopwords(self):
        """Read the stopwords (one per line) from ``self.stopwordsFile``.

        The words become the keys of the ``self.sw`` dict so that membership
        tests in ``getTerms`` are constant time.
        """
        # 'with' guarantees the file is closed even if reading fails.
        with open(self.stopwordsFile, 'r') as f:
            stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)

    def getTerms(self, line):
        """Return the list of stemmed, non-stopword terms found in ``line``.

        The text is lower-cased, every character outside ``a-z0-9`` and space
        is replaced by a space, the result is split on whitespace, stopwords
        (keys of ``self.sw``) are dropped and the remaining words are stemmed
        with the module-level ``porter`` stemmer.
        """
        line = line.lower()
        # put spaces instead of non-alphanumeric characters
        line = re.sub(r'[^a-z0-9 ]', ' ', line)
        words = [w for w in line.split() if w not in self.sw]
        return [porter.stem(word, 0, len(word) - 1) for word in words]

    def parseCollection(self):
        """Return the id, title and text of the next page in the collection.

        Lines are consumed from ``self.collFile`` up to and including the
        next ``</page>`` line. Returns a dict with the keys ``'id'``,
        ``'title'`` and ``'text'``, or an empty dict when any of the three
        tags is missing (which is also what happens at end of file).
        """
        doc = []
        for line in self.collFile:
            # Compare without the line terminator so CRLF files and a final
            # line lacking a newline are recognised as well.
            if line.rstrip('\r\n') == '</page>':
                break
            doc.append(line)

        curPage = ''.join(doc)
        pageid = re.search(r'<id>(.*?)</id>', curPage, re.DOTALL)
        pagetitle = re.search(r'<title>(.*?)</title>', curPage, re.DOTALL)
        pagetext = re.search(r'<text>(.*?)</text>', curPage, re.DOTALL)

        if pageid is None or pagetitle is None or pagetext is None:
            return {}

        return {
            'id': pageid.group(1),
            'title': pagetitle.group(1),
            'text': pagetext.group(1),
        }

    def writeIndexToFile(self):
        """Write the inverted index to ``self.indexFile``.

        One line per term, formatted as
        ``term|docID:pos,pos,...;docID:pos,...``.
        """
        with open(self.indexFile, 'w') as f:
            # Iterating the dict directly works on Python 2 and 3
            # (iterkeys() exists only on Python 2).
            for term in self.index:
                postinglist = [
                    ':'.join([str(p[0]), ','.join(map(str, p[1]))])
                    for p in self.index[term]
                ]
                # f.write replaces the Python-2-only "print >> f" statement.
                f.write(''.join((term, '|', ';'.join(postinglist))) + '\n')

    def getParams(self):
        """Read the stopwords, collection and index file names from sys.argv.

        ``sys.argv[0]`` is the script name, so the three parameters are taken
        from positions 1, 2 and 3.

        Raises:
            IndexError: if fewer than three arguments were supplied.
        """
        param = sys.argv
        if len(param) < 4:
            prog = param[0] if param else 'createIndex.py'
            raise IndexError(
                'usage: %s <stopwords file> <collection file> <index file>'
                % prog)
        self.stopwordsFile = param[1]
        self.collectionFile = param[2]
        self.indexFile = param[3]

    def createIndex(self):
        """Main of the program: parse the collection, build and write the index."""
        self.getParams()
        self.collFile = open(self.collectionFile, 'r')

        # Appending to very large lists can get slow while the cyclic garbage
        # collector is enabled, so it is switched off during indexing and
        # restored afterwards only if it was on to begin with.
        gc_was_enabled = gc.isenabled()
        gc.disable()
        try:
            self.getStopwords()

            # main loop creating the index; an empty dict marks the end
            pagedict = self.parseCollection()
            while pagedict:
                lines = '\n'.join((pagedict['title'], pagedict['text']))
                pageid = int(pagedict['id'])
                terms = self.getTerms(lines)

                # build the index for the current page
                termdictPage = {}
                for position, term in enumerate(terms):
                    if term in termdictPage:
                        termdictPage[term][1].append(position)
                    else:
                        termdictPage[term] = [pageid, array('I', [position])]

                # merge the current page index with the main index
                for termpage, postingpage in termdictPage.items():
                    self.index[termpage].append(postingpage)

                pagedict = self.parseCollection()
        finally:
            # Always release the collection file and restore the collector.
            self.collFile.close()
            if gc_was_enabled:
                gc.enable()

        self.writeIndexToFile()

# Script entry point: build the index only when run directly, not on import.
if __name__ == "__main__":
    c = CreateIndex()
    c.createIndex()
导入系统 进口稀土 从porterStemmer导入porterStemmer 从集合导入defaultdict 从数组导入数组 导入gc 波特 类创建索引: 定义初始化(自): self.index=defaultdict(list)#反向索引 def getStopwords(self): ''从stopwords文件''中获取stopwords' f=打开(self.stopwordsFile'r') stopwords=[line.rstrip()表示f中的行] self.sw=dict.fromkeys(停止字) f、 关闭() def getTerms(self,line): ''给定一个文本流,从文本中获取术语'' line=line.lower() line=re.sub(r'[^a-z0-9],'',line)#用空格代替非字母数字字符 line=line.split() line=[x代表x,如果x不在self.sw中,则x在line中]#删除停止字 line=[porter.stem(单词,0,len(单词)-1)表示行中的单词] 回程线 def parseCollection(自): ''返回集合中下一页的id、标题和文本'' doc=[] 对于self.coll文件中的行: 如果行=='\n': 打破 单据追加(行) curPage=''.join(doc) pageid=re.search('(.*?),curPage,re.DOTALL) pagetitle=re.search(“(.*”),curPage,re.DOTALL) pagetext=re.search(“(.*?”,curPage,re.DOTALL) 如果pageid==无或pagetitle==无或pagetext==无: 返回{} d={} d['id']=pageid.group(1) d['title']=pagetitle.group(1) d['text']=pagetext.group(1) 返回d def写入索引配置文件(自身): ''将反向索引写入文件'' f=打开(self.indexFile,“w”) 对于self.index.iterkeys()中的术语: 发布列表=[] 对于自索引[术语]中的p: docID=p[0] 职位=p[1] postinglist.append(“:”.join([str(docID),“,”.join(map(str,positions))])) 打印>>f'、.join((术语“|”、“;”.join(发布列表))) f、 关闭() def getParams(自): ''获取参数stopwords文件、集合文件和输出索引文件'' param=sys.argv self.stopwordsFile=param[0] self.collectionFile=param[1] self.indexFile=param[2] def createIndex(自): “程序的主体,创建索引” self.getParams() self.collFile=open(self.collectionFile,'r') self.getStopwords() #python垃圾收集器中的bug! #如果启用了gc,则随着大小的增长,追加到列表的内容将变为O(N),而不是O(1)。 gc.disable() pagedict={} pagedict=self.parseCollection() #创建索引的主循环 而pagedict!={}: 行='\n'.连接((pagedict['title'],pagedict['text'])) pageid=int(pagedict['id'])) terms=self.getTerms(行) #为当前页面生成索引 termdictPage={} 对于职位,枚举中的术语(术语): 尝试: termdictPage[术语][1]。追加(职位) 除: termdictPage[term]=[pageid,数组('I',[position])] #将当前页面索引与主索引合并 对于termpage,在termdictPage.iteritems()中发布页面: self.index[termpage].append(postingpage) pagedict=self.parseCollection() gc.enable() self.writeIndexToFile()文件 如果名称=“\uuuuu main\uuuuuuuu”: c=CreateIndex() c、 createIndex() 它说sys.argv中只有一个参数


其他参数应该如何显示

在 getParams
函数中,您可以看到您的代码会读取 3 个参数。调用程序时:

python your_program.py
# sys.argv[0] = 'your_program.py'
它有一个参数。因此,您还需要两个:

python your_program.py arg_1 arg_2
# sys.argv[0] = 'your_program.py'
# sys.argv[1] = 'arg_1'
# sys.argv[2] = 'arg_2'

使用
python my_program.py argument1 argument2 调用程序。根据
def getParams(self)
方法,程序希望从命令行获取参数。在请别人替你完成工作之前,请先试着理解它(并阅读 sys.argv 的文档)……