zen::Xml
Simple C++ XML Processing
 All Classes Namespaces Functions Variables Pages
parser.h
1 // *****************************************************************************
2 // * This file is part of the FreeFileSync project. It is distributed under *
3 // * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0 *
4 // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved *
5 // *****************************************************************************
6 
7 #ifndef PARSER_H_81248670213764583021432
8 #define PARSER_H_81248670213764583021432
9 
10 #include <cstdio>
11 #include <cstddef> //ptrdiff_t; req. on Linux
12 #include <zen/string_tools.h>
13 #include "dom.h"
14 #include "error.h"
15 
16 
17 namespace zen
18 {
24 
31 std::string serialize(const XmlDoc& doc,
32  const std::string& lineBreak = "\r\n",
33  const std::string& indent = " "); //throw ()
34 
36 struct XmlParsingError : public XmlError
37 {
38  XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
40  const size_t row; //beginning with 0
42  const size_t col; //
43 };
44 
45 
47 
52 XmlDoc parse(const std::string& stream); //throw XmlParsingError
53 
54 
55 
56 
57 
58 
59 
60 
61 
62 
63 
64 
65 
66 
67 
68 
69 
70 
71 
72 
73 //---------------------------- implementation ----------------------------
74 //see: http://www.w3.org/TR/xml/
75 
76 namespace implementation
77 {
78 template <class Predicate> inline
79 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a char, return true if value shall be encoded as hex
80 {
81  std::string output;
82  for (const char c : str)
83  {
84  if (c == '&') //
85  output += "&amp;";
86  else if (c == '<') //normalization mandatory: http://www.w3.org/TR/xml/#syntax
87  output += "&lt;";
88  else if (c == '>') //
89  output += "&gt;";
90  else if (pred(c))
91  {
92  if (c == '\'')
93  output += "&apos;";
94  else if (c == '\"')
95  output += "&quot;";
96  else
97  {
98  output += "&#x";
99  const auto hexDigits = hexify(c);
100  output += hexDigits.first;
101  output += hexDigits.second;
102  output += ';';
103  }
104  }
105  else
106  output += c;
107  }
108  return output;
109 }
110 
111 inline
112 std::string normalizeName(const std::string& str)
113 {
114  return normalize(str, [](char c) { return isWhiteSpace(c) || c == '=' || c == '/' || c == '\'' || c == '\"'; });
115 }
116 
117 inline
118 std::string normalizeElementValue(const std::string& str)
119 {
120  return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32; });
121 }
122 
123 inline
124 std::string normalizeAttribValue(const std::string& str)
125 {
126  return normalize(str, [](char c) { return static_cast<unsigned char>(c) < 32 || c == '\'' || c == '\"'; });
127 }
128 
129 
//Test whether the range [first, last) begins with the null-terminated literal "placeholder".
//  match:    "first" is advanced onto the LAST character of the placeholder (not behind it!),
//            so that a caller's subsequent "++first" steps past the entity; returns true
//  no match: "first" is left untouched; returns false
template <class CharIterator, size_t N> inline
bool checkEntity(CharIterator& first, CharIterator last, const char (&placeholder)[N])
{
    assert(placeholder[N - 1] == 0);
    const ptrdiff_t entityLen = N - 1; //don't count null-terminator

    if (last - first < entityLen) //not enough input left
        return false;

    if (!std::equal(first, first + entityLen, placeholder))
        return false;

    first += entityLen - 1;
    return true;
}
142 
143 
144 namespace
145 {
146 std::string denormalize(const std::string& str)
147 {
148  std::string output;
149  for (auto it = str.begin(); it != str.end(); ++it)
150  {
151  const char c = *it;
152 
153  if (c == '&')
154  {
155  if (checkEntity(it, str.end(), "&amp;"))
156  output += '&';
157  else if (checkEntity(it, str.end(), "&lt;"))
158  output += '<';
159  else if (checkEntity(it, str.end(), "&gt;"))
160  output += '>';
161  else if (checkEntity(it, str.end(), "&apos;"))
162  output += '\'';
163  else if (checkEntity(it, str.end(), "&quot;"))
164  output += '\"';
165  else if (str.end() - it >= 6 &&
166  it[1] == '#' &&
167  it[2] == 'x' &&
168  it[5] == ';')
169  {
170  output += unhexify(it[3], it[4]);
171  it += 5;
172  }
173  else
174  output += c; //unexpected char!
175  }
176  else if (c == '\r') //map all end-of-line characters to \n http://www.w3.org/TR/xml/#sec-line-ends
177  {
178  auto itNext = it + 1;
179  if (itNext != str.end() && *itNext == '\n')
180  ++it;
181  output += '\n';
182  }
183  else
184  output += c;
185  }
186  return output;
187 }
188 
189 
190 void serialize(const XmlElement& element, std::string& stream,
191  const std::string& lineBreak,
192  const std::string& indent,
193  size_t indentLevel)
194 {
195  const std::string& nameFmt = normalizeName(element.getNameAs<std::string>());
196 
197  for (size_t i = 0; i < indentLevel; ++i)
198  stream += indent;
199 
200  stream += '<' + nameFmt;
201 
202  auto attr = element.getAttributes();
203  for (auto it = attr.first; it != attr.second; ++it)
204  stream += ' ' + normalizeName(it->first) + "=\"" + normalizeAttribValue(it->second) + '\"';
205 
206  //no support for mixed-mode content
207  auto iterPair = element.getChildren();
208  if (iterPair.first != iterPair.second) //structured element
209  {
210  stream += '>' + lineBreak;
211 
212  std::for_each(iterPair.first, iterPair.second,
213  [&](const XmlElement& el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
214 
215  for (size_t i = 0; i < indentLevel; ++i)
216  stream += indent;
217  stream += "</" + nameFmt + '>' + lineBreak;
218  }
219  else
220  {
221  std::string value;
222  element.getValue(value);
223 
224  if (!value.empty()) //value element
225  stream += '>' + normalizeElementValue(value) + "</" + nameFmt + '>' + lineBreak;
226  else //empty element
227  stream += "/>" + lineBreak;
228  }
229 }
230 
231 std::string serialize(const XmlDoc& doc,
232  const std::string& lineBreak,
233  const std::string& indent)
234 {
235  std::string version = doc.getVersionAs<std::string>();
236  if (!version.empty())
237  version = " version=\"" + normalizeAttribValue(version) + '\"';
238 
239  std::string encoding = doc.getEncodingAs<std::string>();
240  if (!encoding.empty())
241  encoding = " encoding=\"" + normalizeAttribValue(encoding) + '\"';
242 
243  std::string standalone = doc.getStandaloneAs<std::string>();
244  if (!standalone.empty())
245  standalone = " standalone=\"" + normalizeAttribValue(standalone) + '\"';
246 
247  std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
248  serialize(doc.root(), output, lineBreak, indent, 0);
249  return output;
250 }
251 }
252 }
253 
254 inline
255 std::string serialize(const XmlDoc& doc,
256  const std::string& lineBreak,
257  const std::string& indent) { return implementation::serialize(doc, lineBreak, indent); }
258 
259 /*
260 Grammar for XML parser
261 -------------------------------
262 document-expression:
263  <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
264  element-expression:
265 
266 element-expression:
267  <string attributes-expression/>
268  <string attributes-expression> pm-expression </string>
269 
270 element-list-expression:
271  <empty>
272  element-expression element-list-expression
273 
274 attributes-expression:
275  <empty>
276  string="string" attributes-expression
277 
278 pm-expression:
279  string
280  element-list-expression
281 */
282 
283 namespace implementation
284 {
//Lexical unit of the XML grammar: either one of the fixed symbols or a name carrying text
struct Token
{
    enum Type
    {
        TK_LESS,
        TK_GREATER,
        TK_LESS_SLASH,
        TK_SLASH_GREATER,
        TK_EQUAL,
        TK_QUOTE,
        TK_DECL_BEGIN,
        TK_DECL_END,
        TK_NAME,
        TK_END
    };

    //both constructors are intentionally implicit: a Type or a std::string converts to a Token
    Token(Type tokenType) : type(tokenType) {}
    Token(const std::string& text) : type(TK_NAME), name(text) {}

    Type type;
    std::string name; //filled if type == TK_NAME
};
307 
308 class Scanner
309 {
310 public:
311  Scanner(const std::string& stream) : stream_(stream), pos(stream_.begin())
312  {
313  if (zen::startsWith(stream_, BYTE_ORDER_MARK_UTF8))
314  pos += strLength(BYTE_ORDER_MARK_UTF8);
315  }
316 
317  Token nextToken() //throw XmlParsingError
318  {
319  //skip whitespace
320  pos = std::find_if(pos, stream_.end(), [](char c) { return !zen::isWhiteSpace(c); });
321 
322  if (pos == stream_.end())
323  return Token::TK_END;
324 
325  //skip XML comments
326  if (startsWith(xmlCommentBegin))
327  {
328  auto it = std::search(pos + xmlCommentBegin.size(), stream_.end(), xmlCommentEnd.begin(), xmlCommentEnd.end());
329  if (it != stream_.end())
330  {
331  pos = it + xmlCommentEnd.size();
332  return nextToken();
333  }
334  }
335 
336  for (auto it = tokens.begin(); it != tokens.end(); ++it)
337  if (startsWith(it->first))
338  {
339  pos += it->first.size();
340  return it->second;
341  }
342 
343  auto nameEnd = std::find_if(pos, stream_.end(), [](char c)
344  {
345  return c == '<' ||
346  c == '>' ||
347  c == '=' ||
348  c == '/' ||
349  c == '\'' ||
350  c == '\"' ||
351  zen::isWhiteSpace(c);
352  });
353 
354  if (nameEnd != pos)
355  {
356  std::string name(&*pos, nameEnd - pos);
357  pos = nameEnd;
358  return implementation::denormalize(name);
359  }
360 
361  //unknown token
362  throw XmlParsingError(posRow(), posCol());
363  }
364 
365  std::string extractElementValue()
366  {
367  auto it = std::find_if(pos, stream_.end(), [](char c)
368  {
369  return c == '<' ||
370  c == '>';
371  });
372  std::string output(pos, it);
373  pos = it;
374  return implementation::denormalize(output);
375  }
376 
377  std::string extractAttributeValue()
378  {
379  auto it = std::find_if(pos, stream_.end(), [](char c)
380  {
381  return c == '<' ||
382  c == '>' ||
383  c == '\'' ||
384  c == '\"';
385  });
386  std::string output(pos, it);
387  pos = it;
388  return implementation::denormalize(output);
389  }
390 
391  size_t posRow() const //current row beginning with 0
392  {
393  const size_t crSum = std::count(stream_.begin(), pos, '\r'); //carriage returns
394  const size_t nlSum = std::count(stream_.begin(), pos, '\n'); //new lines
395  assert(crSum == 0 || nlSum == 0 || crSum == nlSum);
396  return std::max(crSum, nlSum); //be compatible with Linux/Mac/Win
397  }
398 
399  size_t posCol() const //current col beginning with 0
400  {
401  //seek beginning of line
402  for (auto it = pos; it != stream_.begin(); )
403  {
404  --it;
405  if (*it == '\r' || *it == '\n')
406  return pos - it - 1;
407  }
408  return pos - stream_.begin();
409  }
410 
411 private:
412  Scanner (const Scanner&) = delete;
413  Scanner& operator=(const Scanner&) = delete;
414 
415  bool startsWith(const std::string& prefix) const
416  {
417  if (stream_.end() - pos < static_cast<ptrdiff_t>(prefix.size()))
418  return false;
419  return std::equal(prefix.begin(), prefix.end(), pos);
420  }
421 
422  using TokenList = std::vector<std::pair<std::string, Token::Type>>;
423  const TokenList tokens
424  {
425  { "<?xml", Token::TK_DECL_BEGIN },
426  { "?>", Token::TK_DECL_END },
427  { "</", Token::TK_LESS_SLASH },
428  { "/>", Token::TK_SLASH_GREATER },
429  { "<" , Token::TK_LESS }, //evaluate after TK_DECL_BEGIN!
430  { ">" , Token::TK_GREATER },
431  { "=" , Token::TK_EQUAL },
432  { "\"", Token::TK_QUOTE },
433  { "\'", Token::TK_QUOTE },
434  };
435 
436  const std::string xmlCommentBegin = "<!--";
437  const std::string xmlCommentEnd = "-->";
438 
439  const std::string stream_;
440  std::string::const_iterator pos;
441 };
442 
443 
444 class XmlParser
445 {
446 public:
447  XmlParser(const std::string& stream) :
448  scn(stream),
449  tk(scn.nextToken()) {}
450 
451  XmlDoc parse() //throw XmlParsingError
452  {
453  XmlDoc doc;
454 
455  //declaration (optional)
456  if (token().type == Token::TK_DECL_BEGIN)
457  {
458  nextToken();
459 
460  while (token().type == Token::TK_NAME)
461  {
462  std::string attribName = token().name;
463  nextToken();
464 
465  consumeToken(Token::TK_EQUAL);
466  expectToken(Token::TK_QUOTE);
467  std::string attribValue = scn.extractAttributeValue();
468  nextToken();
469 
470  consumeToken(Token::TK_QUOTE);
471 
472  if (attribName == "version")
473  doc.setVersion(attribValue);
474  else if (attribName == "encoding")
475  doc.setEncoding(attribValue);
476  else if (attribName == "standalone")
477  doc.setStandalone(attribValue);
478  }
479  consumeToken(Token::TK_DECL_END);
480  }
481 
482  XmlElement dummy;
483  parseChildElements(dummy);
484 
485  auto itPair = dummy.getChildren();
486  if (itPair.first != itPair.second)
487  doc.root().swapSubtree(*itPair.first);
488 
489  expectToken(Token::TK_END);
490  return doc;
491  }
492 
493 private:
494  XmlParser (const XmlParser&) = delete;
495  XmlParser& operator=(const XmlParser&) = delete;
496 
497  void parseChildElements(XmlElement& parent)
498  {
499  while (token().type == Token::TK_LESS)
500  {
501  nextToken();
502 
503  expectToken(Token::TK_NAME);
504  std::string elementName = token().name;
505  nextToken();
506 
507  XmlElement& newElement = parent.addChild(elementName);
508 
509  parseAttributes(newElement);
510 
511  if (token().type == Token::TK_SLASH_GREATER) //empty element
512  {
513  nextToken();
514  continue;
515  }
516 
517  expectToken(Token::TK_GREATER);
518  std::string elementValue = scn.extractElementValue();
519  nextToken();
520 
521  //no support for mixed-mode content
522  if (token().type == Token::TK_LESS) //structured element
523  parseChildElements(newElement);
524  else //value element
525  newElement.setValue(elementValue);
526 
527  consumeToken(Token::TK_LESS_SLASH);
528 
529  if (token().type != Token::TK_NAME ||
530  elementName != token().name)
531  throw XmlParsingError(scn.posRow(), scn.posCol());
532  nextToken();
533 
534  consumeToken(Token::TK_GREATER);
535  }
536  }
537 
538  void parseAttributes(XmlElement& element)
539  {
540  while (token().type == Token::TK_NAME)
541  {
542  std::string attribName = token().name;
543  nextToken();
544 
545  consumeToken(Token::TK_EQUAL);
546  expectToken(Token::TK_QUOTE);
547  std::string attribValue = scn.extractAttributeValue();
548  nextToken();
549 
550  consumeToken(Token::TK_QUOTE);
551  element.setAttribute(attribName, attribValue);
552  }
553  }
554 
555  const Token& token() const { return tk; }
556  void nextToken() { tk = scn.nextToken(); }
557 
558  void consumeToken(Token::Type t) //throw XmlParsingError
559  {
560  expectToken(t); //throw XmlParsingError
561  nextToken();
562  }
563 
564  void expectToken(Token::Type t) //throw XmlParsingError
565  {
566  if (token().type != t)
567  throw XmlParsingError(scn.posRow(), scn.posCol());
568  }
569 
570  Scanner scn;
571  Token tk;
572 };
573 }
574 
575 inline
576 XmlDoc parse(const std::string& stream) //throw XmlParsingError
577 {
578  return implementation::XmlParser(stream).parse(); //throw XmlParsingError
579 }
580 }
581 
582 #endif //PARSER_H_81248670213764583021432
XmlDoc parse(const std::string &stream)
Load XML document from a byte stream.
Definition: parser.h:576
std::string serialize(const XmlDoc &doc, const std::string &lineBreak="\r\n", const std::string &indent=" ")
Save XML document as a byte stream.
Definition: parser.h:255
The zen::Xml namespace.
Definition: bind.h:15
The complete XML document.
Definition: dom.h:246
const size_t row
Input file row where the parsing error occurred (zero-based)
Definition: parser.h:40
Exception thrown due to an XML parsing error.
Definition: parser.h:36
Exception base class for zen::Xml.
Definition: error.h:13
const size_t col
Input file column where the parsing error occurred (zero-based)
Definition: parser.h:42