convertSBMLToCobra

PURPOSE ^

convertSBMLToCobra Convert SBML format model (created using SBML Toolbox)

SYNOPSIS ^

function model = convertSBMLToCobra(modelSBML,defaultBound,compSymbolList,compNameList)

DESCRIPTION ^

convertSBMLToCobra Convert SBML format model (created using SBML Toolbox)
to Cobra format

 model = convertSBMLToCobra(modelSBML,defaultBound)

INPUTS
 modelSBML         SBML model structure

OPTIONAL INPUTS
 defaultBound      Maximum bound for model (Default = 1000)
 compSymbolList    List of compartment symbols
 compNameList      List of compartment names corresponding to compSymbolList

OUTPUT
 model             COBRA model structure
 Markus Herrgard 1/25/08

 Ines Thiele 01/27/2010 - I added new field to be read-in from SBML file
 if provided in file (e.g., references, comments, metabolite IDs, etc.)

 Richard Que 02/08/10 - Properly format reaction and metabolite fields
                        from SBML.

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001 function model = convertSBMLToCobra(modelSBML,defaultBound,compSymbolList,compNameList)
0002 %convertSBMLToCobra Convert SBML format model (created using SBML Toolbox)
0003 %to Cobra format
0004 %
0005 % model = convertSBMLToCobra(modelSBML,defaultBound)
0006 %
0007 %INPUTS
0008 % modelSBML         SBML model structure
0009 %
0010 %OPTIONAL INPUTS
0011 % defaultBound      Maximum bound for model (Default = 1000)
0012 % compSymbolList    List of compartment symbols
0013 % compNameList      List of compartment names corresponding to compSymbolList
0014 %
0015 %OUTPUT
0016 % model             COBRA model structure
0017 % Markus Herrgard 1/25/08
0018 %
0019 % Ines Thiele 01/27/2010 - I added new field to be read-in from SBML file
0020 % if provided in file (e.g., references, comments, metabolite IDs, etc.)
0021 %
0022 % Richard Que 02/08/10 - Properly format reaction and metabolite fields
0023 %                        from SBML.
0024 %
0025 
0026 if (nargin < 2)
0027     defaultBound = 1000;
0028 end
0029 
0030 if nargin < 3
0031     compSymbolList = {};
0032     compNameList = {};
0033 end
0034 
0035 nMetsTmp = length(modelSBML.species);
0036 nRxns = length(modelSBML.reaction);
0037 
0038 %% Construct initial metabolite list
0039 formulaCount = 0;
0040 speciesList = {};
0041 chargeList = [];
0042 metFormulas = {};
0043 haveFormulasFlag = false;
0044 tmpSpecies = [];
0045 for i = 1:nMetsTmp
0046     % Ignore boundary metabolites
0047     if (~modelSBML.species(i).boundaryCondition)
0048       %Check for the Palsson lab _b$ boundary condition indicator
0049       if (isempty(regexp(modelSBML.species(i).id,'_b$')));
0050         tmpSpecies = [ tmpSpecies  modelSBML.species(i)];
0051         speciesList{end+1} = modelSBML.species(i).id;
0052         notesField = modelSBML.species(i).notes;
0053         % Get formula if in notes field
0054         if (~isempty(notesField))
0055           [tmp,tmp,tmp,tmp,formula,tmp,tmp,tmp,tmp,charge] = parseSBMLNotesField(notesField);
0056           tmpCharge = charge;
0057           metFormulas {end+1} = formula;
0058           formulaCount = formulaCount + 1;
0059           haveFormulasFlag = true;
0060         end
0061         chargeList= [chargeList modelSBML.species(i).charge];
0062       end
0063     end
0064 end
0065 
0066 nMets = length(speciesList);
0067 
0068 %% Construct stoichiometric matrix and reaction list
0069 S = sparse(nMets,nRxns);
0070 rev = zeros(nRxns,1);
0071 lb = zeros(nRxns,1);
0072 ub = zeros(nRxns,1);
0073 c = zeros(nRxns,1);
0074 rxns = cell(nRxns,1);
0075 rules = cell(nRxns,1);
0076 genes = cell(nRxns,1);
0077 allGenes = {};
0078 h = waitbar(0,'Reading SBML file ...');
0079 hasNotesField = false;
0080 for i = 1:nRxns
0081     if mod(i,10) == 0
0082         waitbar(i/nRxns,h);
0083     end
0084     % Read the gpra from the notes field
0085     notesField = modelSBML.reaction(i).notes;
0086     if (~isempty(notesField))
0087         [geneList,rule,subSystem,grRule,formula,confidenceScore, citation, comment, ecNumber] = parseSBMLNotesField(notesField);
0088         subSystems{i} = subSystem;
0089         genes{i} = geneList;
0090         allGenes = [allGenes geneList];
0091         rules{i} = rule;
0092         grRules{i} = grRule;
0093         hasNotesField = true;
0094         confidenceScores{i}= confidenceScore;
0095         citations{i} = citation;
0096         comments{i} = comment;
0097         ecNumbers{i} = ecNumber;
0098     end
0099     rev(i) = modelSBML.reaction(i).reversible;
0100     rxnNameTmp = regexprep(modelSBML.reaction(i).name,'^R_','');
0101     rxnNames{i} = regexprep(rxnNameTmp,'_+',' ');
0102     rxnsTmp = regexprep(modelSBML.reaction(i).id,'^R_','');
0103     rxns{i} = cleanUpFormatting(rxnsTmp);
0104     % Construct S-matrix
0105     reactantStruct = modelSBML.reaction(i).reactant;
0106     for j = 1:length(reactantStruct)
0107         speciesID = find(strcmp(reactantStruct(j).species,speciesList));
0108         if (~isempty(speciesID))
0109             stoichCoeff = reactantStruct(j).stoichiometry;
0110             S(speciesID,i) = -stoichCoeff;
0111         end
0112     end
0113     productStruct = modelSBML.reaction(i).product;
0114     for j = 1:length(productStruct)
0115         speciesID = find(strcmp(productStruct(j).species,speciesList));
0116         if (~isempty(speciesID))
0117             stoichCoeff = productStruct(j).stoichiometry;
0118             S(speciesID,i) = stoichCoeff;
0119         end
0120     end
0121     if isfield(modelSBML.reaction(i).kineticLaw,'parameter')
0122         parameters = modelSBML.reaction(i).kineticLaw.parameter;
0123     else
0124         parameters =[];
0125     end
0126     if (~isempty(parameters))
0127         for j = 1:length(parameters)
0128             paramStruct = parameters(j);
0129             switch paramStruct.id
0130                 case 'LOWER_BOUND'
0131                     lb(i) = paramStruct.value;
0132                     if (lb(i) < -defaultBound)
0133                         lb(i) = -defaultBound;
0134                     end
0135                 case 'UPPER_BOUND'
0136                     ub(i) = paramStruct.value;
0137                     if (ub(i) > defaultBound)
0138                         ub(i) = defaultBound;
0139                     end
0140                 case 'OBJECTIVE_COEFFICIENT'
0141                     c(i) = paramStruct.value;
0142             end
0143         end
0144     else
0145         ub(i) = defaultBound;
0146         if (rev(i) == 1)
0147             lb(i) = -defaultBound;
0148         else
0149             lb(i) = 0;
0150         end
0151     end
0152 end
0153 %close the waitbar if this is matlab
0154 if (regexp(version, 'R20'))
0155     close(h);
0156 end
0157 allGenes = unique(allGenes);
0158 
0159 %% Construct gene to rxn mapping
0160 if (hasNotesField)
0161     
0162     rxnGeneMat = sparse(nRxns,length(allGenes));
0163     h = waitbar(0,'Constructing GPR mapping ...');
0164     for i = 1:nRxns
0165         if mod(i,10) == 0
0166             waitbar(i/nRxns,h);
0167         end
0168         if iscell(genes{i})
0169             [tmp,geneInd] = ismember(genes{i},allGenes);
0170         else
0171             [tmp,geneInd] = ismember(num2cell(genes{i}),allGenes);
0172         end
0173         
0174         rxnGeneMat(i,geneInd) = 1;
0175         for j = 1:length(geneInd)
0176             rules{i} = strrep(rules{i},['x(' num2str(j) ')'],['x(' num2str(geneInd(j)) '_TMP_)']);
0177         end
0178         rules{i} = strrep(rules{i},'_TMP_','');
0179     end
0180     %close the waitbar if this is matlab
0181     if (regexp(version, 'R20'))
0182         close(h);
0183     end
0184     
0185 end
0186 
0187 %% Construct metabolite list
0188 mets = cell(nMets,1);
0189 compartmentList = cell(length(modelSBML.compartment),1);
0190 if isempty(compSymbolList), useCompList = true; else useCompList = false; end
0191 for i=1:length(modelSBML.compartment)
0192     compartmentList{i} = modelSBML.compartment(i).id;
0193 end
0194 
0195 h = waitbar(0,'Constructing metabolite lists ...');
0196 hasAnnotationField = 0;
0197 for i = 1:nMets
0198     if mod(i,10) == 0
0199         waitbar(i/nMets,h);
0200     end
0201     % Parse metabolite id's
0202     % Get rid of the M_ in the beginning of metabolite id's
0203     metID = regexprep(speciesList{i},'^M_','');
0204     metID = regexprep(metID,'^_','');
0205     % Find compartment id
0206     tmpCell = {};
0207     if useCompList
0208         for j=1:length(compartmentList)
0209             tmpCell = regexp(metID,['_(' compartmentList{j} ')$'],'tokens');
0210             if ~isempty(tmpCell), break; end
0211         end
0212         if isempty(tmpCell), useCompList = false; end
0213     elseif ~isempty(compSymbolList)
0214         for j = 1: length(compSymbolList)
0215             tmpCell = regexp(metID,['_(' compSymbolList{j} ')$'],'tokens');
0216             if ~isempty(tmpCell), break; end
0217         end
0218     end
0219     if isempty(tmpCell), tmpCell = regexp(metID,'_(.)$','tokens'); end
0220     if ~isempty(tmpCell)
0221         compID = tmpCell{1};
0222         metTmp = [regexprep(metID,['_' compID{1} '$'],'') '[' compID{1} ']'];
0223     else
0224         metTmp = metID;
0225     end
0226     %Clean up met ID
0227     mets{i} = cleanUpFormatting(metTmp);
0228     % Parse metabolite names
0229     % Clean up some of the weird stuff in the sbml files
0230     metNamesTmp = regexprep(tmpSpecies(i).name,'^M_','');
0231     metNamesTmp = cleanUpFormatting(metNamesTmp);
0232     metNamesTmp = regexprep(metNamesTmp,'^_','');
0233 %     metNamesTmp = strrep(metNamesTmp,'_','-');
0234     metNamesTmp = regexprep(metNamesTmp,'-+','-');
0235     metNamesTmp = regexprep(metNamesTmp,'-$','');
0236     metNamesAlt{i} = metNamesTmp;
0237     % Separate formulas from names
0238     %[tmp,tmp,tmp,tmp,tokens] = regexp(metNamesTmp,'(.*)-((([A(Ag)(As)C(Ca)(Cd)(Cl)(Co)(Cu)F(Fe)H(Hg)IKLM(Mg)(Mn)N(Na)(Ni)OPRS(Se)UWXY(Zn)]?)(\d*)))*$');
0239     if (~haveFormulasFlag)
0240         [tmp,tmp,tmp,tmp,tokens] = regexp(metNamesTmp,'(.*)_((((A|Ag|As|C|Ca|Cd|Cl|Co|Cu|F|Fe|H|Hg|I|K|L|M|Mg|Mn|Mo|N|Na|Ni|O|P|R|S|Se|U|W|X|Y|Zn)?)(\d*)))*$');
0241         if (isempty(tokens))
0242             if length(metFormulas)<i||(metFormulas{i}=='')
0243                 metFormulas{i} = '';
0244             end
0245             metNames{i} = metNamesTmp;
0246         else
0247             formulaCount = formulaCount + 1;
0248             metFormulas{i} = tokens{1}{2};
0249             metNames{i} = tokens{1}{1};
0250         end
0251     else
0252         metNames{i} = metNamesTmp;
0253     end
0254     if isfield(modelSBML.species(i),'annotation')
0255         hasAnnotationField = 1;
0256         [metCHEBI,metKEGG,metPubChem,metInChI] = parseSBMLAnnotationField(modelSBML.species(i).annotation);
0257         metCHEBIID{i} = metCHEBI;
0258         metKEGGID{i} = metKEGG;
0259         metPubChemID{i} = metPubChem;
0260         metInChIString{i} = metInChI;
0261     end
0262 end
0263 if ( regexp( version, 'R20') )
0264     close(h);
0265 end
0266 
0267 %% Collect everything into a structure
0268 model.rxns = rxns;
0269 model.mets = mets;
0270 model.S = S;
0271 model.rev = rev;
0272 model.lb = lb;
0273 model.ub = ub;
0274 model.c = c;
0275 model.metCharge = transpose(chargeList);
0276 if (hasNotesField)
0277     model.rules = rules;
0278     model.genes = columnVector(allGenes);
0279     model.rxnGeneMat = rxnGeneMat;
0280     model.grRules = columnVector(grRules);
0281     model.subSystems = columnVector(subSystems);
0282     model.confidenceScores = columnVector(confidenceScores);
0283     model.rxnReferences = columnVector(citations);
0284     model.rxnECNumbers = columnVector(ecNumbers);
0285     model.rxnNotes = columnVector(comments);
0286 end
0287 model.rxnNames = columnVector(rxnNames);
0288 % Only include formulas if at least 90% of metabolites have them (otherwise
0289 % the "formulas" are probably just parts of metabolite names)
0290 if (formulaCount < 0.9*nMets)
0291     model.metNames = columnVector(metNamesAlt);
0292 else
0293     model.metNames = columnVector(metNames);
0294     model.metFormulas = columnVector(metFormulas);
0295 end
0296 if (hasAnnotationField)
0297     model.metChEBIID = columnVector(metCHEBIID);
0298     model.metKEGGID = columnVector(metKEGGID);
0299     model.metPubChemID = columnVector(metPubChemID);
0300     model.metInChIString = columnVector(metInChIString);
0301 end
0302 
0303 %%
0304 function [genes,rule,subSystem,grRule,formula,confidenceScore,citation,comment,ecNumber,charge] = parseSBMLNotesField(notesField)
0305 %parseSBMLNotesField Parse the notes field of an SBML file to extract
0306 %gene-rxn associations
0307 %
0308 % [genes,rule] = parseSBMLNotesField(notesField)
0309 %
0310 % Markus Herrgard 8/7/06
0311 % Ines Thiele 1/27/10 Added new fields
0312 % Handle different notes fields
0313 if isempty(regexp(notesField,'html:p', 'once'))
0314     tag = 'p';
0315 else
0316     tag = 'html:p';
0317 end
0318 
0319 subSystem = '';
0320 grRule = '';
0321 genes = {};
0322 rule = '';
0323 formula = '';
0324 confidenceScore = '';
0325 citation = '';
0326 ecNumber = '';
0327 comment = '';
0328 charge = [];
0329 Comment = 0;
0330 
0331 [tmp,fieldList] = regexp(notesField,['<' tag '>.*?</' tag '>'],'tokens','match');
0332 
0333 for i = 1:length(fieldList)
0334     fieldTmp = regexp(fieldList{i},['<' tag '>(.*)</' tag '>'],'tokens');
0335     fieldStr = fieldTmp{1}{1};
0336     if (regexp(fieldStr,'GENE_ASSOCIATION'))
0337         gprStr = regexprep(strrep(fieldStr,'GENE_ASSOCIATION:',''),'^(\s)+','');
0338         grRule = gprStr;
0339         [genes,rule] = parseBoolean(gprStr);
0340     elseif (regexp(fieldStr,'GENE ASSOCIATION'))
0341         gprStr = regexprep(strrep(fieldStr,'GENE ASSOCIATION:',''),'^(\s)+','');
0342         grRule = gprStr;
0343         [genes,rule] = parseBoolean(gprStr);
0344     elseif (regexp(fieldStr,'SUBSYSTEM'))
0345         subSystem = regexprep(strrep(fieldStr,'SUBSYSTEM:',''),'^(\s)+','');
0346         subSystem = strrep(subSystem,'S_','');
0347         subSystem = regexprep(subSystem,'_+',' ');
0348         if (isempty(subSystem))
0349             subSystem = 'Exchange';
0350         end
0351     elseif (regexp(fieldStr,'EC Number'))
0352         ecNumber = regexprep(strrep(fieldStr,'EC Number:',''),'^(\s)+','');
0353     elseif (regexp(fieldStr,'FORMULA'))
0354         formula = regexprep(strrep(fieldStr,'FORMULA:',''),'^(\s)+','');
0355     elseif (regexp(fieldStr,'CHARGE'))
0356         charge = str2num(regexprep(strrep(fieldStr,'CHARGE:',''),'^(\s)+',''));
0357     elseif (regexp(fieldStr,'AUTHORS'))
0358         if isempty(citation)
0359             citation = strcat(regexprep(strrep(fieldStr,'AUTHORS:',''),'^(\s)+',''));
0360         else
0361             citation = strcat(citation,';',regexprep(strrep(fieldStr,'AUTHORS:',''),'^(\s)+',''));
0362         end
0363     elseif Comment == 1 && isempty(regexp(fieldStr,'genes:', 'once'))
0364         Comment = 0;
0365         comment = fieldStr;
0366     elseif (regexp(fieldStr,'Confidence'))
0367         confidenceScore = regexprep(strrep(fieldStr,'Confidence Level:',''),'^(\s)+','');
0368         Comment = 1;
0369     end
0370 end
0371 %%
0372 function [metCHEBI,metKEGG,metPubChem,metInChI] = parseSBMLAnnotationField(annotationField)
0373 %parseSBMLAnnotationField Parse the annotation field of an SBML file to extract
0374 %metabolite information associations
0375 %
0376 % [genes,rule] = parseSBMLAnnotationField(annotationField)
0377 %
0378 % Ines Thiele 1/27/10 Added new fields
0379 % Handle different notes fields
0380 
0381 
0382 metPubChem = '';
0383 metCHEBI = '';
0384 metKEGG = '';
0385 metPubChem = '';
0386 metInChI='';
0387 
0388 [tmp,fieldList] = regexp(annotationField,'<rdf:li rdf:resource="urn:miriam:(\w+).*?"/>','tokens','match');
0389 
0390 %fieldTmp = regexp(fieldList{i},['<' tag '>(.*)</' tag '>'],'tokens');
0391 for i = 1:length(fieldList)
0392     fieldTmp = regexp(fieldList{i},['<rdf:li rdf:resource="urn:miriam:(.*)"/>'],'tokens');
0393     fieldStr = fieldTmp{1}{1};
0394     if (regexp(fieldStr,'obo.chebi'))
0395         metCHEBI = strrep(fieldStr,'obo.chebi:CHEBI%','');
0396     elseif (regexp(fieldStr,'kegg.compound'))
0397         metKEGG = strrep(fieldStr,'kegg.compound:','');
0398     elseif (regexp(fieldStr,'pubchem.substance'))
0399         metPubChem = strrep(fieldStr,'pubchem.substance:','');
0400     end
0401 end
0402 % get InChI string
0403 fieldTmp = regexp(annotationField,'<in:inchi xmlns:in="http://biomodels.net/inchi" metaid="(.*?)">(.*?)</in:inchi>','tokens');
0404 if ~isempty(fieldTmp)
0405 fieldStr = fieldTmp{1}{2};
0406 if (regexp(fieldStr,'InChI'))
0407     metInChI = strrep(fieldStr,'InChI=','');
0408 end
0409 end
0410 
0411 %% Cleanup Formatting
0412 function str = cleanUpFormatting(str)
0413 str = strrep(str,'-DASH-','-');
0414 str = strrep(str,'_DASH_','-');
0415 str = strrep(str,'_FSLASH_','/');
0416 str = strrep(str,'_BSLASH_','\');
0417 str = strrep(str,'_LPAREN_','(');
0418 str = strrep(str,'_LSQBKT_','[');
0419 str = strrep(str,'_RSQBKT_',']');
0420 str = strrep(str,'_RPAREN_',')');
0421 str = strrep(str,'_COMMA_',',');
0422 str = strrep(str,'_PERIOD_','.');
0423 str = strrep(str,'_APOS_','''');
0424 str = regexprep(str,'_e_$','(e)');
0425 str = regexprep(str,'_e$','(e)');
0426 str = strrep(str,'&amp;','&');
0427 str = strrep(str,'&lt;','<');
0428 str = strrep(str,'&gt;','>');
0429 str = strrep(str,'&quot;','"');

Generated on Thu 21-Jun-2012 15:39:23 by m2html © 2003