-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_formula.m
More file actions
261 lines (199 loc) · 8.15 KB
/
parse_formula.m
File metadata and controls
261 lines (199 loc) · 8.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
function r = parse_formula(varargin)
% PARSE_FORMULA Parses a chemical formula to form an atomic representation.
%
% SYNTAX
%
% r = parse_formula(str)
% r = parse_formula({str1,str2,str3, ...})
%
% Parses chemical formulas and returns a structure array holding the an
% atomic representation of the chemical forulas. The input is a string or
% a cell array of strings.
%
%
% EXAMPLES
%
% 1. Chemical formulas of varying complexity
%
% parse_formula('H2O'); % Water
% parse_formula('NaHCO3'); % Sodium Bicarbonate
% parse_formula('(CH4)8(H2O)46'); % Methane Clathrate
% parse_formula('CH3COOCH2CH3'); % Ethyl Acetate
% parse_formula('MnO4-'); % Negative Charge Ion
%
% parse_formula('dCH4'); % Returns error message
%
% 2. Create an structure array of atomic representations for a set of
% compounds
%
% r = parse_formula({'CH4','O2','CO2','H2O'});
%
%
% USAGE NOTES
%
% 1. Formulas are made of up of sequences of elements followed by
% integers indicating the number of included atoms. Omitted integers
% are assumed to be one.
%
% 2. Elements are the conventional one or two character abbreviations.
% The character is captialized. If present, the second character is
% lower case. In addition to the standard elements, the parser allows
% for
%
% Symbol Entity Interpretation
% e electron like an element with MW = 0
% D deuterium an element
% T tritium an element
% M any metal like an element, mw = NaN
% X any halogen like an element, mw = NaN
% Me methyl group (CH3) CH3 substituted for Me
% Et ethyl group (C2H5) C2H5 substituted for Et
% Bu butyl group (C4H9) C4H9 substituted for Bu
% Ph phenol group (C6H5) C6H5 substituted for Ph
%
% 3. Subgroups may be included between parenthesis or brackets followed
% by an integer indicating number of repetitions. Two levels of
% subgrouping are allowed.
%
% 4. A terminal lower case suffix denoting phases will be correctly
% parsed. The phase must be one of (aq), (l), (g), or (s).
%
% 5. The charge on an ionic species is appended as a + or - followed by
% an optional integer. Examples are H+, OH-, or Ca+2.
%
% 6. The bare electron e- is used in balancing chemical half reactions.
%
% 7. Error messages are generated for invalid fomulas
%
% 8. str can be a cell array of chemical formula. The results is a
% structure array. The elements of the output structure array are in
% one-to-one correspondence with elements of the cell array. For
% example
%
% r = parse_formula({'CH4','CH3OH','CHOOH'})
%
% r(1) holds the atomic formula for CH4, r(2) for CH3OH, and r(3) for
% CHOOH.
% AUTHOR
%
% Jeff Kantor
% December 18, 2010
assert(nargin > 0, 'parse_formula:input', ['No input. Expects a ', ...
'string or cell array of chemical formulas.']);
assert(nargin < 2, 'stoich:input', 'Unexpected extra inputs.');
switch class(varargin{1})
case 'char' % Single formula
str = varargin;
case 'cell' % Cell array of formulas
str = varargin{1};
otherwise
error('parse_formula:input',['requires cell array of ',...
'chemical formulas.']);
end
assert(iscellstr(str), 'parse_formula:input', ...
'Formulas must be strings.');
% Trim any whitespace at front or back
str = strtrim(str);
% Remove phase information. This information is currently neglected. In
% a later version we may wish to incorporate phase into a more complete
% data structure for representing chemical formula.
prex = '|\((aq|g|l|s)\)$';
str = regexprep(str,prex,'');
% Substitute for some common chemical abbreviations
str = regexprep(str,'Bu','C4H9'); % Butyl
str = regexprep(str,'Et','C2H5'); % Ethyl
str = regexprep(str,'Me','CH3'); % Methyl
str = regexprep(str,'Ph','C6H5'); % Phenol
% Apply the main parser to every element of str
q = cellfun(@(s)parse_formula_(s,3),str,'Uniform',false);
% Union of all atomic species
atoms = {};
for i = 1:length(q(:))
atoms = union(atoms, fields(q{i}));
end
% Add all atomic species to all structures.
for i = 1:length(q(:))
for j = 1:length(atoms)
if ~ismember(atoms{j},fields(q{i}))
q{i}.(atoms{j}) = 0;
end
end
end
% Form the structure array to have the same shape as str
r = reshape([q{:}],size(str));
end % parse_formula
function r = parse_formula_(str,kdepth)
assert(kdepth > 0, 'parse_formula_:Recursion', ...
'Reached maximum recursion depth');
r = struct([]);
% Regular expression returning tokens for element and number
% sexpr matches single elements followed by a digit, or a +/-
% followed by a digit to denote charge
persistent srex; % Regexp pattern to match elements and charges
persistent grex; % Regexp pattern to match groups
if isempty(srex) || isempty(grex)
srex = ['(A[lrsgutcm]|B[eraik]?|C[laroudsemf]?|D[y]?|E[urs]|', ...
'F[erm]?|G[aed]|H[eofgas]?|I[nr]?|Kr?|L[iaur]|', ...
'M[gnodt]?|N[eaibdpos]?|Os?|P[drmtboau]?|R[buhenaf]|', ...
'S[icernbmg]?|T[icebmalh]?|U|V|W|X[e]?|Yb?|Z[nr])', ...
'(\d*\.\d+|\d*)', ...
'|(e|+|-)(\d*)'];
grex = '|\(([^\)]*)\)(\d*\.\d+|\d*)|\[([^\]]*)\](\d*\.\d+|\d*)';
end
% Parse formula for chemical groups. This picks out anything that looks
% an element followed by a number, or a subgroup within parentheses.
% The tokens are returned in the cell array u. Each u{k} has two
% elements, the first is a string denoting the group, and the second is
% number string of repetitions.
[u,s,e] = regexp(str,[srex,grex],'tokens','start','end');
% Report any parsing errors. A parse error occurs if there are any
% characters not matched as tokens. We scan the start and end positions
% of the tokens to determine if there are any gaps.
g(1:length(str)) = '^';
for i = 1:length(s);
g(s(i):e(i)) = ' ';
end
assert(all(g ~= '^'), 'parse_formula:ParseError', ...
'Could not parse formula:\n %s\n %s\n', str, char(g));
% Extract atom tokens from the first part of each token
tok = cellfun(@(v)v{1},u,'Uni',false);
% Extract counts from the second part of each token, convert to
% doubles, empty counts set to 1
cnt = cellfun(@(v)v{2},u,'Uni',false);
cnt = str2double(cnt);
cnt(isnan(cnt)) = 1;
% Loop over tokens
for j = 1:length(u)
% See if token matches an element
if strcmp(tok{j},regexp(tok{j},srex,'match'))
% The token exactly matches an element.
% Change + or - tokens to 'Q'.
tok{j} = regexprep(tok{j},'+','Q');
if strcmp(tok{j}, '-')
tok{j} = 'Q';
cnt(j) = -cnt(j);
end
% Update atomic representation, adding a field if needed.
if isfield(r,tok{j})
r.(tok{j}) = r.(tok{j}) + cnt(j);
else
r(1).(tok{j}) = cnt(j);
end
else
% The token must be a group, so do a recursion to find
% an atomic represenation of the group.
q = parse_formula_(tok{j},kdepth-1);
% Updatethe atomic representation to include the group.
% Add fields if needed. Multiply by number of groups in the
% formula we're parsing.
f = fields(q);
for k = 1:length(f)
if isfield(r,f{k})
r.(f{k}) = r.(f{k}) + cnt(j)*q.(f{k});
else
r(1).(f{k}) = cnt(j)*q.(f{k});
end
end
end
end
end % parse_formula_