import bisect
import os
import shutil
import subprocess
import tempfile


def printWarning(newValue, oldDictionary):
    """Report a duplicate value and keep the existing collection unchanged.

    Used as the default ``resolveDuplicateValuesFunction``: it prints a
    warning mentioning *newValue* and returns *oldDictionary* untouched.
    """
    print("Warning: same values,  %s" % (newValue,))
    return oldDictionary


def AnnotateListOfChromosomePositionFilesWithGOFromBioMartEnsembl(
        listOfFilesToAnnotate=None,
        numberOfFirstLinesToIgnoreInFileToAnnotate=None,
        chromosomeColumnOfFilesToAnnotate=None,
        positionColumnOfFilesToAnnotate=None,
        resolveDuplicateValuesFunctionInFileToBeAnnotated=printWarning,
        fileWithGOAnnotation=None,
        fileWithGOAnnotationChromosomeColumn=None,
        fileWithGOAnnotationStartColumn=None,
        fileWithGOAnnotationEndColumn=None,
        columnsWithGOAnnotationComaSeparated=None,
        numberOfFirstLinesToIgnoreInGOAnnotationFile=None,
        outputDirectory=None,
        outputSuffix=None):
    """Annotate every chromosome/position file with GO terms.

    For each file in *listOfFilesToAnnotate* the pipeline is:

    1. write one ``chr, position, GO columns...`` line per position that
       falls inside a range of *fileWithGOAnnotation*;
    2. sort that file on its first two columns with UNIX ``sort``;
    3. collapse consecutive lines sharing chromosome and position, joining
       the GO columns with ``|``;
    4. join the grouped annotation back onto the original file and write it
       to ``outputDirectory/<basename of the input><outputSuffix>``.

    Column arguments are zero-based indexes; *columnsWithGOAnnotationComaSeparated*
    is a comma separated string of zero-based indexes into the GO file.
    """
    for fileToAnnotate in listOfFilesToAnnotate:
        print("FILE:  %s" % (fileToAnnotate,))

        # The intermediate file always has the layout
        # chromosome, position, GO column 1, GO column 2, ...
        # so the GO columns live at indexes 2 .. 2+k-1 regardless of where
        # they were in the original GO file.
        numberOfGOColumns = len(columnsWithGOAnnotationComaSeparated.split(","))
        intermediateGOColumns = ",".join(
            str(2 + offset) for offset in range(numberOfGOColumns))

        annotationOutputFilename = RequestTemporaryFilename()
        sortedAnnotationOutputFilename = RequestTemporaryFilename()
        groupedSortedAnnotationFilename = RequestTemporaryFilename()
        try:
            CreateGOAnnotationForFilesFromBioMartEnsemblFile(
                listOfFilesToAnnotate=[fileToAnnotate],
                numberOfFirstLinesToIgnore=numberOfFirstLinesToIgnoreInFileToAnnotate,
                chromosomeColumnOfFilesToAnnotate=chromosomeColumnOfFilesToAnnotate,
                chromosomeColumnOfFilesToAnnotateApplyFunction=None,
                positionColumnOfFilesToAnnotate=positionColumnOfFilesToAnnotate,
                resolveDuplicateValuesFunction=resolveDuplicateValuesFunctionInFileToBeAnnotated,
                fileWithGOAnnotation=fileWithGOAnnotation,
                fileWithGOAnnotationChromosomeColumn=fileWithGOAnnotationChromosomeColumn,
                fileWithGOAnnotationStartColumn=fileWithGOAnnotationStartColumn,
                fileWithGOAnnotationEndColumn=fileWithGOAnnotationEndColumn,
                columnsWithGOAnnotation=columnsWithGOAnnotationComaSeparated,
                numberOfFirstLinesToIgnoreInGOAnnotationFile=numberOfFirstLinesToIgnoreInGOAnnotationFile,
                outputFilename=annotationOutputFilename)

            SortFileAccordingToColumnsByUsingUNIXSort(
                annotationOutputFilename, "0,1", sortedAnnotationOutputFilename)

            GroupColumnsOfAFileAccordingToColumns(
                inputFilename=sortedAnnotationOutputFilename,
                comaSeparatedColumnsToGroup=intermediateGOColumns,
                comaSeparatedKeyColumns="0,1",
                outputFilename=groupedSortedAnnotationFilename)

            CreateAnnotationFrom2FilesWith2CommonColumns(
                filenameToBeAnnotated=fileToAnnotate,
                numberOfFirstLinesToIgnoreOfFileToBeAnnotated=numberOfFirstLinesToIgnoreInFileToAnnotate,
                key1ColumnOfFileToBeAnnotated=chromosomeColumnOfFilesToAnnotate,
                key2ColumnOfFileToBeAnnotated=positionColumnOfFilesToAnnotate,
                annotationFilename=groupedSortedAnnotationFilename,
                key1ColumnsOfAnnotationFile=0,
                key2ColumnsOfAnnotationFile=1,
                annotationColumnsOfAnnotationFile=intermediateGOColumns,
                outputHeader="Chr\tPosition\tGO_BiologicalProcess\tGO_BiologicalProcess_Description\tGO_CellularComponent\tGO_CellularComponent_Description\tGO_MolecularFunction\tGO_MolecularFunction_Description",
                outputFilename=os.path.join(
                    outputDirectory,
                    GetBasenameOfFilename(fileToAnnotate) + outputSuffix))
        finally:
            # The intermediate files are only needed within this iteration.
            for temporaryFilename in (annotationOutputFilename,
                                      sortedAnnotationOutputFilename,
                                      groupedSortedAnnotationFilename):
                if os.path.exists(temporaryFilename):
                    os.remove(temporaryFilename)


def CreateAnnotationFrom2FilesWith2CommonColumns(
        filenameToBeAnnotated=None,
        numberOfFirstLinesToIgnoreOfFileToBeAnnotated=0,
        key1ColumnOfFileToBeAnnotated=None,
        key1OfFileToBeAnnotatedApplyFunction=None,
        key2ColumnOfFileToBeAnnotated=None,
        key2OfFileToBeAnnotatedApplyFunction=None,
        annotationFilename=None,
        key1ColumnsOfAnnotationFile=None,
        key1ColumnsOfAnnotationFileApplyFunction=None,
        key2ColumnsOfAnnotationFile=None,
        key2ColumnsOfAnnotationFileApplyFunction=None,
        annotationColumnsOfAnnotationFile=None,
        outputHeader=None,
        outputFilename=None):
    """Join a tab separated file with an annotation file on two key columns.

    The annotation file is loaded in memory, keyed by its two key columns.
    For every non-ignored line of *filenameToBeAnnotated* one output line is
    written: ``key1<TAB>key2`` followed, when the pair exists in the
    annotation, by the tab separated annotation values. Lines without a
    matching annotation are still written, with the two keys only.

    *outputHeader*, when given, is written once before the first data line
    (nothing is written for an empty input file). The optional ``ApplyFunction``
    arguments transform the corresponding key before it is used.
    """
    # Loading annotation file
    annotation = GetDictionaryOfManyTabularFilesWith2KeysAnd1ListOfValues(
        listOfFiles=[annotationFilename],
        key1Column=key1ColumnsOfAnnotationFile,
        key1ColumnApplyFunction=key1ColumnsOfAnnotationFileApplyFunction,
        key2Column=key2ColumnsOfAnnotationFile,
        key2ColumnApplyFunction=key2ColumnsOfAnnotationFileApplyFunction,
        valueColumnsComaSeparated=annotationColumnsOfAnnotationFile)

    with open(filenameToBeAnnotated) as fileToBeAnnotated, \
            open(outputFilename, "w") as outputFile:
        for lineNumber, line in enumerate(fileToBeAnnotated, 1):
            if lineNumber == 1 and outputHeader is not None:
                outputFile.write(outputHeader + "\n")
            if lineNumber <= numberOfFirstLinesToIgnoreOfFileToBeAnnotated:
                continue

            fields = line.replace("\n", "").split("\t")

            key1 = fields[key1ColumnOfFileToBeAnnotated]
            if key1OfFileToBeAnnotatedApplyFunction is not None:
                key1 = key1OfFileToBeAnnotatedApplyFunction(key1)

            key2 = fields[key2ColumnOfFileToBeAnnotated]
            if key2OfFileToBeAnnotatedApplyFunction is not None:
                key2 = key2OfFileToBeAnnotatedApplyFunction(key2)

            outputFile.write(key1 + "\t" + key2)
            annotationValues = annotation.get(key1, {}).get(key2)
            if annotationValues is not None:
                outputFile.write("\t" + "\t".join(annotationValues))
            outputFile.write("\n")


# from "3" --> 3
def positionStringToInteger(positionStr):
    """Convert a position read from a text file to an integer."""
    return int(positionStr)


def CreateGOAnnotationForFilesFromBioMartEnsemblFile(
        listOfFilesToAnnotate=None,
        numberOfFirstLinesToIgnore=0,
        chromosomeColumnOfFilesToAnnotate=None,
        chromosomeColumnOfFilesToAnnotateApplyFunction=None,
        positionColumnOfFilesToAnnotate=None,
        resolveDuplicateValuesFunction=None,
        fileWithGOAnnotation=None,
        fileWithGOAnnotationChromosomeColumn=None,
        fileWithGOAnnotationStartColumn=None,
        fileWithGOAnnotationEndColumn=None,
        columnsWithGOAnnotation=None,
        numberOfFirstLinesToIgnoreInGOAnnotationFile=0,
        outputFilename=None):
    """Write the GO annotation of every position covered by a GO range.

    Positions are loaded per chromosome from *listOfFilesToAnnotate* and
    sorted. Each non-ignored line of *fileWithGOAnnotation* defines an
    inclusive ``[start, end]`` range on a chromosome; for every loaded
    position inside that range a line
    ``chromosome<TAB>position<TAB>GO column values...`` is written to
    *outputFilename*. Chromosomes present in the GO file but absent from
    the files to annotate are reported once at the end.
    """
    SNPs = GetDictionaryOfManyTabularFilesWith1KeyAnd1ListOfValues(
        listOfFiles=listOfFilesToAnnotate,
        numberOfFirstLinesToIgnore=numberOfFirstLinesToIgnore,
        firstCharacterOfLineComment="#",
        keyColumn=chromosomeColumnOfFilesToAnnotate,
        valueColumn=positionColumnOfFilesToAnnotate,
        resolveDuplicateValuesFunction=resolveDuplicateValuesFunction,
        applyFunctionToKey1=chromosomeColumnOfFilesToAnnotateApplyFunction,
        applyFunctionToValue=positionStringToInteger)

    # Sort positions so that ranges can be located by binary search.
    print("Sorting..")
    for chromosome in SNPs:
        SNPs[chromosome].sort()
    print("..Done")

    listOfColumnsWithGOAnnotation = [
        int(x) for x in columnsWithGOAnnotation.split(",")]
    notExistedChromosomes = []

    with open(outputFilename, "w") as outputFile, \
            open(fileWithGOAnnotation) as GOFile:
        for GOFileLineCounter, GOFileLine in enumerate(GOFile, 1):
            if GOFileLineCounter % 10000 == 0:
                print("GO lines parsed:  %d" % GOFileLineCounter)

            # Ignore first lines
            if GOFileLineCounter <= numberOfFirstLinesToIgnoreInGOAnnotationFile:
                continue

            # Read values
            GOFileLineSplitted = GOFileLine.replace("\n", "").split("\t")
            GOChromosome = GOFileLineSplitted[fileWithGOAnnotationChromosomeColumn]
            GOStart = int(GOFileLineSplitted[fileWithGOAnnotationStartColumn])
            GOEnd = int(GOFileLineSplitted[fileWithGOAnnotationEndColumn])

            # Does GOChromosome exist in SNPs?
            if GOChromosome not in SNPs:
                if GOChromosome not in notExistedChromosomes:
                    notExistedChromosomes.append(GOChromosome)
                continue

            positions = SNPs[GOChromosome]
            firstIndex = bisect.bisect_left(positions, GOStart)
            lastIndex = bisect.bisect_right(positions, GOEnd)
            annotationSuffix = "".join(
                "\t" + GOFileLineSplitted[column]
                for column in listOfColumnsWithGOAnnotation)
            for position in positions[firstIndex:lastIndex]:
                outputFile.write(
                    GOChromosome + "\t" + str(position) + annotationSuffix + "\n")

    if len(notExistedChromosomes) > 0:
        print("Warning the following Chromosomes were not found in the listOfFilesToBeAnnotated:")
        print(",".join(notExistedChromosomes))


def GetBasenameOfFilename(filename=None):
    """Return the file name without its directory and last extension."""
    return os.path.basename(os.path.splitext(filename)[0])


def _IsCommentLine(lineSplitted, firstCharacterOfLineComment):
    """Tell whether the first field starts with the comment character.

    An empty first field is never a comment; ``None`` disables the check.
    """
    if firstCharacterOfLineComment is None:
        return False
    firstField = lineSplitted[0]
    return firstField != "" and firstField[0] == firstCharacterOfLineComment


def GetDictionaryOfManyTabularFilesWith1KeyAnd1ListOfValues(
        listOfFiles=None,
        numberOfFirstLinesToIgnore=0,
        firstCharacterOfLineComment="#",
        keyColumn=None,
        valueColumn=None,
        resolveDuplicateValuesFunction=None,
        applyFunctionToKey1=None,
        applyFunctionToValue=None):
    """Load tab separated files into ``{key: [value, ...]}``.

    For every non-ignored, non-comment line the field at *keyColumn* is the
    key and the field at *valueColumn* is appended to that key's list.
    When the value is already in the list, *resolveDuplicateValuesFunction*
    (if given) is called as ``f(value, currentList)`` and its result replaces
    the list; otherwise the duplicate is dropped. The first
    *numberOfFirstLinesToIgnore* lines of each file are skipped.
    """
    toReturn = {}

    for filename in listOfFiles:
        print(filename)
        with open(filename) as inputFile:
            for lineCounter, line in enumerate(inputFile, 1):
                # Ignore first lines if necessary
                if lineCounter <= numberOfFirstLinesToIgnore:
                    continue

                lineSplitted = line.replace("\n", "").split("\t")
                if _IsCommentLine(lineSplitted, firstCharacterOfLineComment):
                    continue

                key1 = lineSplitted[keyColumn]
                if applyFunctionToKey1 is not None:
                    key1 = applyFunctionToKey1(key1)

                value = lineSplitted[valueColumn]
                if applyFunctionToValue is not None:
                    value = applyFunctionToValue(value)

                if key1 not in toReturn:
                    toReturn[key1] = [value]
                elif value not in toReturn[key1]:
                    toReturn[key1].append(value)
                elif resolveDuplicateValuesFunction is not None:
                    toReturn[key1] = resolveDuplicateValuesFunction(
                        value, toReturn[key1])

    return toReturn


def GetDictionaryOfManyTabularFilesWith2KeysAnd1ListOfValues(
        listOfFiles=None,
        numberOfFirstLinesToIgnore=0,
        firstCharacterOfLineComment="#",
        key1Column=None,
        key1ColumnApplyFunction=None,
        key2Column=None,
        key2ColumnApplyFunction=None,
        valueColumnsComaSeparated=None,
        resolveDuplicateValuesFunction=None):
    """Load tab separated files into ``{key1: {key2: [values...]}}``.

    The values are the fields at the comma separated, zero-based indexes in
    *valueColumnsComaSeparated*. The first occurrence of a ``(key1, key2)``
    pair is stored; later occurrences are kept out unless the membership
    test below succeeds and a resolver is supplied.
    """
    toReturn = {}
    valueColumns = [int(x) for x in valueColumnsComaSeparated.split(",")]

    for filename in listOfFiles:
        print(filename)
        with open(filename) as inputFile:
            for lineCounter, line in enumerate(inputFile, 1):
                # Ignore first lines if necessary
                if lineCounter <= numberOfFirstLinesToIgnore:
                    continue

                lineSplitted = line.replace("\n", "").split("\t")
                # Guarded check: an empty first field must not raise.
                if _IsCommentLine(lineSplitted, firstCharacterOfLineComment):
                    continue

                key1 = lineSplitted[key1Column]
                if key1ColumnApplyFunction is not None:
                    key1 = key1ColumnApplyFunction(key1)

                key2 = lineSplitted[key2Column]
                if key2ColumnApplyFunction is not None:
                    key2 = key2ColumnApplyFunction(key2)

                values = [lineSplitted[x] for x in valueColumns]

                valuesOfKey1 = toReturn.setdefault(key1, {})
                if key2 not in valuesOfKey1:
                    valuesOfKey1[key2] = values
                # NOTE(review): this tests whether the new list is an
                # *element* of the stored list, which is kept as-is from the
                # original code; with plain string fields it never matches,
                # so repeated pairs keep their first values.
                elif (values in valuesOfKey1[key2]
                        and resolveDuplicateValuesFunction is not None):
                    valuesOfKey1[key2] = resolveDuplicateValuesFunction(
                        values, valuesOfKey1[key2])

    return toReturn


def _WriteGroupedLine(outputFile, lastLineSplitted, groupColumns, common):
    """Write one collapsed line for a finished group of input lines."""
    lineToPrint = []
    for column, value in enumerate(lastLineSplitted):
        if column in groupColumns:
            lineToPrint.append("|".join(common[column]))
        else:
            lineToPrint.append(value)
    outputFile.write("\t".join(lineToPrint) + "\n")


def GroupColumnsOfAFileAccordingToColumns(
        inputFilename=None,
        comaSeparatedColumnsToGroup=None,
        comaSeparatedKeyColumns=None,
        outputFilename=None,
        numberOfFirstLinesToIgnore=0):
    """Collapse consecutive lines that share the same key columns.

    The input must already be ordered so that lines with equal key columns
    are adjacent. One output line is written per group: the columns listed
    in *comaSeparatedColumnsToGroup* contain the distinct non-empty values
    of the whole group joined with ``|`` (in order of first appearance),
    every other column is copied from the last line of the group.
    The first *numberOfFirstLinesToIgnore* lines are skipped and not written.
    """
    groupColumns = [int(x) for x in comaSeparatedColumnsToGroup.split(",")]
    keyColumns = [int(x) for x in comaSeparatedKeyColumns.split(",")]

    with open(inputFilename) as inputFile, \
            open(outputFilename, "w") as outputFile:
        previousLineSplitted = None
        common = dict((column, []) for column in groupColumns)

        for lineCounter, line in enumerate(inputFile, 1):
            if lineCounter <= numberOfFirstLinesToIgnore:
                continue
            if lineCounter % 10000 == 0:
                print("Lines parsed:  %d" % lineCounter)

            lineSplitted = line.replace("\n", "").split("\t")

            startsNewGroup = previousLineSplitted is not None and any(
                previousLineSplitted[keyColumn] != lineSplitted[keyColumn]
                for keyColumn in keyColumns)
            if startsNewGroup:
                _WriteGroupedLine(
                    outputFile, previousLineSplitted, groupColumns, common)
                common = dict((column, []) for column in groupColumns)

            # Every line contributes to its own group, including the first
            # line of the group.
            for column in groupColumns:
                value = lineSplitted[column]
                if value != "" and value not in common[column]:
                    common[column].append(value)

            previousLineSplitted = lineSplitted

        # Flush the last group, if the file had any data line.
        if previousLineSplitted is not None:
            _WriteGroupedLine(
                outputFile, previousLineSplitted, groupColumns, common)


def Move(path_source=None, path_target=None):
    """Move a file or directory from *path_source* to *path_target*."""
    shutil.move(path_source, path_target)


def RequestTemporaryFilename():
    """Return the path of a new, empty temporary file.

    The file is created (and left on disk) so the name cannot be claimed by
    another process before the caller writes to it; the caller owns it.
    """
    fileDescriptor, toReturn = tempfile.mkstemp()
    os.close(fileDescriptor)
    return toReturn


def SortFileAccordingToColumnsByUsingUNIXSort(
        inputFilename=None,
        comaSeparatedKeyColumns=None,
        outputFilename=None):
    """Sort a file with the UNIX ``sort`` command, one pass per key column.

    *comaSeparatedKeyColumns* holds zero-based column indexes. The passes
    run from the last key to the first (``sort -k <column + 1>``),
    alternating between a temporary file and *outputFilename*, and the final
    result always ends up in *outputFilename*.

    Raises ``subprocess.CalledProcessError`` when ``sort`` fails.
    """
    keyColumns = [int(x) + 1 for x in comaSeparatedKeyColumns.split(",")]
    keyColumns.reverse()

    temporaryFilename = RequestTemporaryFilename()
    tmpInput = inputFilename
    tmpOutput = temporaryFilename

    try:
        for column in keyColumns:
            print("sort -k " + str(column) + " " + tmpInput + " > " + tmpOutput)
            # Argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through verbatim.
            with open(tmpOutput, "w") as sortedFile:
                subprocess.check_call(
                    ["sort", "-k", str(column), tmpInput], stdout=sortedFile)

            if tmpOutput == temporaryFilename:
                tmpInput = temporaryFilename
                tmpOutput = outputFilename
            else:
                tmpInput = outputFilename
                tmpOutput = temporaryFilename

        # After an odd number of passes the result sits in the temporary file.
        if tmpOutput == outputFilename:
            print("moving temporary file..")
            Move(temporaryFilename, outputFilename)
            print("done")
    finally:
        if os.path.exists(temporaryFilename):
            os.remove(temporaryFilename)