import os def CreateAlleleFrequenciesFileFromVCFFilename(pathToVCFTools=None, VCFFilename=None): basename = GetBasenameOfFilename(VCFFilename) extension = GetExtensionOfFilename(VCFFilename) pathOfFilename = GetPathOfFilename(VCFFilename) outputFilePathAndBasename = pathOfFilename + "/" + basename command = pathToVCFTools + " --vcf " + VCFFilename + " --out " + outputFilePathAndBasename + " --freq" print command os.system(command) def CreateAlleleFrequencyAnnotationFilesForTabularFilenamesFromVCFFilenames( pathToVCFTools=None, listOfVCFFiles=None, listOfFilenamesToBeAnnotated=None, outputPreffix="_AlleleFrequency.txt", xapianIndexDirectory=None ): if xapianIndexDirectory == None: xapianIndexDirectory = GetTemporaryDirectory() print "Creating allele frequencies.." alleleFrequencyFiles = [] for VCFFile in listOfVCFFiles: path = GetPathOfFilename(VCFFile) basename = GetBasenameOfFilename(VCFFile) outputFile = path + "/" + basename + ".frq" alleleFrequencyFiles += [outputFile] CreateAlleleFrequenciesFileFromVCFFilename(pathToVCFTools, VCFFile) print "Done creating allele frequencies" print "Creating Xapian Index.." MakeDirectoryIgnoreIfExists(xapianIndexDirectory) CreateXapianIndexFromListOfTabularFilenames( listOfTabularFilenames=alleleFrequencyFiles, listOfKeyColumns=[0,1], listOfRestColumns=[2,3,4,5], addNameOfFile=True, addBasenameOfFile=True, xapianIndexDirectory=xapianIndexDirectory ) print "Done Creating Xapian Index" print "Creating annotation files from the xapian index.." header = "chr\tposition" for filename in alleleFrequencyFiles: header += "\tNumber of chromosomes\t1st Allele Frequency\t2nd Allele Frequency\t"+ filename CreateAnnotationFilesFromXapianIndexCreatedFromGroupOfFiles( xapianIndexDirectory=xapianIndexDirectory, listOfFilenamesToBeAnnotated=listOfFilenamesToBeAnnotated, numberOfFirstLinesToIgnoreInFilesToBeAnnotated=1, listOfKeyColumnsOfFilesToBeAnnotated=[2,3], listOfAnnotationColumnsOfIndex=[2,3,4,5], header=header, listOfFilesWhereTheAnnotationWasCreatedFrom=alleleFrequencyFiles, prefixOfOutputFilename=outputPreffix ) print "Done creating annotation files from the xapian index" def CreateAnnotationFilesFromXapianIndexCreatedFromGroupOfFiles( xapianIndexDirectory=None, listOfFilenamesToBeAnnotated=None, numberOfFirstLinesToIgnoreInFilesToBeAnnotated=0, listOfKeyColumnsOfFilesToBeAnnotated=None, listOfAnnotationColumnsOfIndex=None, keyConnector="--", header=None, listOfFilesWhereTheAnnotationWasCreatedFrom=None, prefixOfOutputFilename=None ): xapianIndex = XapianIndex(xapianIndexDirectory) countOfFilesWhereTheAnnotationWasCreatedFrom = len(listOfFilesWhereTheAnnotationWasCreatedFrom) countOfAnnotationColumnsOfIndex = len(listOfAnnotationColumnsOfIndex) countOfKeyColumnsOfFilesToBeAnnotated = len(listOfKeyColumnsOfFilesToBeAnnotated) basenameOfFilesWhereTheAnnotationWasCreatedFrom = [GetBasenameOfFilename(x) for x in listOfFilesWhereTheAnnotationWasCreatedFrom] annotationLength = (countOfFilesWhereTheAnnotationWasCreatedFrom * countOfAnnotationColumnsOfIndex) + countOfKeyColumnsOfFilesToBeAnnotated for filenameToBeAnnotated in listOfFilenamesToBeAnnotated: print "Annotating file: ", filenameToBeAnnotated fileToBeAnnotated = open(filenameToBeAnnotated) pathOfFileToBeAnnotated = GetPathOfFilename(filenameToBeAnnotated) basenameOfFileToBeAnnotated = GetBasenameOfFilename(filenameToBeAnnotated) outputFilename = pathOfFileToBeAnnotated + "/" + basenameOfFileToBeAnnotated + prefixOfOutputFilename print "Writing to output file: ", outputFilename outputFile = open(outputFilename, "w") if header != None: outputFile.write(header + "\n") linesCount = 0 notFound = 0 while (True): lineOfFileToBeAnnotated = fileToBeAnnotated.readline() if lineOfFileToBeAnnotated == "": break linesCount += 1 if linesCount <= numberOfFirstLinesToIgnoreInFilesToBeAnnotated: continue if linesCount % 1000 == 0: print "Lines parsed: ", linesCount lineOfFileToBeAnnotatedSplitted = lineOfFileToBeAnnotated.replace("\n", "").split("\t") toPrint = [''] * annotationLength #Forming query key = [] for index, keyColumn in enumerate(listOfKeyColumnsOfFilesToBeAnnotated): cell = "" if keyColumn < len(lineOfFileToBeAnnotatedSplitted): cell = lineOfFileToBeAnnotatedSplitted[keyColumn] key += [cell] toPrint[index] = cell query = str.join(keyConnector, key) searchResults = xapianIndex.search(query, maxHits=countOfFilesWhereTheAnnotationWasCreatedFrom) for searchResult in searchResults: #From which file this result is coming from? Take the last record searchResultSplitted = searchResult.split(" ") searchResultsKey = searchResultSplitted[0] searchResultLength = len(searchResultSplitted) originalFilename = searchResultSplitted[searchResultLength-1] indexOfFileOfResult = basenameOfFilesWhereTheAnnotationWasCreatedFrom.index(originalFilename) for index, annotationColumnOfIndex in enumerate(listOfAnnotationColumnsOfIndex): toPrint[2 + (indexOfFileOfResult*countOfAnnotationColumnsOfIndex) + index] = searchResultSplitted[annotationColumnOfIndex] outputFile.write(str.join("\t", toPrint) + "\n") outputFile.close() def CreateXapianIndexFromListOfTabularFilenames( listOfTabularFilenames=None, listOfKeyColumns=None, listOfRestColumns=None, keyConnector = "--", addNameOfFile=False, addBasenameOfFile=False, xapianIndexDirectory=None ): if xapianIndexDirectory==None: xapianIndexDirectory=GetTemporaryDirectory() xapianIndex = XapianIndex(xapianIndexDirectory) for tabularFilename in listOfTabularFilenames: print "Storing in index the file: ", tabularFilename tabularFile = open(tabularFilename) lineCounter = 0 while True: lineOfTabularFile = tabularFile.readline() if lineOfTabularFile == "": break lineCounter += 1 if lineCounter % 1000 == 0: print "Lines parsed: ", lineCounter lineOfTabularFileSplitted = lineOfTabularFile.replace("\n", "").split("\t") key = [] for keyColumn in listOfKeyColumns: cell = "" if keyColumn < len(lineOfTabularFileSplitted): cell = lineOfTabularFileSplitted[keyColumn] key += [cell] rest = [] for restColumn in listOfRestColumns: cell = "" if restColumn < len(lineOfTabularFileSplitted): cell = lineOfTabularFileSplitted[restColumn] rest += [cell] record = str.join(keyConnector, key) + " " + str.join(" ", rest) if addNameOfFile: if addBasenameOfFile: record += " " + GetBasenameOfFilename(tabularFilename) else: record += " " + tabularFilename xapianIndex.addRecord(record) return xapianIndexDirectory import os def GetBasenameOfFilename(filename=None): return os.path.basename(os.path.splitext(filename)[0]) import os def GetExtensionOfFilename(filename=None): return os.path.splitext(filename)[1] import os def GetPathOfFilename(filename=None): return os.path.dirname(filename) import os def MkdirIgnoreExists(dir_name=None): try: os.mkdir(dir_name) except OSError as inst: if str(inst).find("File exists") > -1: print "Warning: Dir: " + dir_name + " already exists" else: raise inst MakeDirectoryIgnoreIfExists = MkdirIgnoreExists import tempfile def RequestTemporaryDirectory(): return tempfile.mkdtemp() GetTemporaryDirectory = RequestTemporaryDirectory try: import xapian except ImportError as inst: print "xapian has not been installed. Please refer to http://xapian.org/download for downloading and installing. Exiting.." raise inst class XapianIndex: def __init__(self, databaseDirectory, stemming=None): # Open the database for update, creating a new database if necessary. self.database = xapian.WritableDatabase(databaseDirectory, xapian.DB_CREATE_OR_OPEN) self.indexer = xapian.TermGenerator() if stemming != None: stemmer = xapian.Stem("english") indexer.set_stemmer(stemmer) def addRecord(self, record): doc = xapian.Document() doc.set_data(record) self.indexer.set_document(doc) self.indexer.index_text(record) # Add the document to the database. self.database.add_document(doc) def search(self, term, maxHits=1, stemming=None): # Start an enquire session. enquire = xapian.Enquire(self.database) qp = xapian.QueryParser() if stemming != None: stemmer = xapian.Stem(stemming) qp.set_stemmer(stemmer) qp.set_database(self.database) qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) query = qp.parse_query(term) # Find the top result for the query. enquire.set_query(query) matches = enquire.get_mset(0, maxHits) toReturn = [] for m in matches: #print "%i: %i%% docid=%i [%s]" % (m.rank + 1, m.percent, m.docid, m.document.get_data()) toReturn += [m.document.get_data()] return toReturn