roxen.lists.pike.general

Subject Author Date
RFC: New Parser class for parsing structured datafiles (non-XML) Stephen R. van den Berg <srb[at]cuci[dot]nl> 28-01-2009
I've revamped some datafileparsing routines I've been using for years,
and made them more robust/generic in the process.
I'd like to include this class into mainstream Pike (7.8 or 7.9) eventually,
as Parser.Structure (for lack of a better name) perhaps.
The sample records here are:
a. Some bank statements in CSV format.
b. Some bank statements in MT940 format.

Any comments on the provided interface, the data-structure description,
or the usefulness of the class? Or is it perhaps superfluous because all
of this can already be done with Parser.LR, and I'm just being silly for
not reusing that?

As for the documentation: I know it's a bit sparse at the moment;
it will be improved ;-).

Sample input:

---------------------cut here-------------------------
1234563,"20081128","IC",00000,8689839,"KN:
ARNL-1034758710-","",19.90,"A","M","BETREFT:25399280                ACCESS
INTERNET BV","EUR"
2348165,"20081128","DV",00000,0,"DEBETRENTE
GIROKWARTAALKREDIET","",10.04,"A","M","PER. 21/10-20/11 (1,15% PER MND)","EUR"
2341864,"20081128","DV",00000,0,"DEBETRENTE 1,39% PER
MAAND,","",0.35,"A","M","PERIODE 21/10 - 20/11","EUR"

0000 01PSTBNL21XXXX00001
0000 01PSTBNL21XXXX00001
940 00
:20:MPBZ
:25:0003890413
:28C:000
:60F:C071205EUR2085,93
:61:071206C11,90NGT NONREF
:86:0001239968 Hr X Colijn LANDGRAAF
Hr X Colijn LANDGRAAF 1700.163221231474
TRANSACTIEDATUM: 06-12-2007
:61:071206D109,10NIC 004952732340412
:86:0002243510 KN: 004952739440412
049-8223343 BETREFT FACTUUR D.D. 28-11-2007
INCL. 17,42 BTW KPN Telefonie
:61:071206D34,95NDV 4700134832333684
:86:0998234454 KN: 4702162821344484
4700124821134444 R.2003-13224 K.122368
PEDESCO/KLONSSEN.ORG
:62F:C081119EUR289,33
:86:D000004C000009D9961,90C60315,92
-XXX
---------------------cut here-------------------------

Sample output (corresponds with the above input):
---------------------cut here-------------------------
Found record: ([ /* 1 element */
  "csv": ([ /* 1 element */
      "gtz": ({ /* 3 elements */
            ([ /* 11 elements */
              "afbij": "A",
              "amount": "19.90",
              "bankno": "8689839",
              "mutatie": "M",
              "mutatiesoort": "IC",
              "mybankno": "1234563",
              "name": "KN: ARNL-1034758710-",
              "reference": "BETREFT:25399280                ACCESS INTERNET BV",
              "transferdate": "20081128",
              "valutacode": "EUR",
              "volgnummer": "00000"
            ]),
            ([ /* 11 elements */
              "afbij": "A",
              "amount": "10.04",
              "bankno": "0",
              "mutatie": "M",
              "mutatiesoort": "DV",
              "mybankno": "2348165",
              "name": "DEBETRENTE GIROKWARTAALKREDIET",
              "reference": "PER. 21/10-20/11 (1,15% PER MND)",
              "transferdate": "20081128",
              "valutacode": "EUR",
              "volgnummer": "00000"
            ]),
            ([ /* 11 elements */
              "afbij": "A",
              "amount": "0.35",
              "bankno": "0",
              "mutatie": "M",
              "mutatiesoort": "DV",
              "mybankno": "2341864",
              "name": "DEBETRENTE 1,39% PER MAAND,",
              "reference": "PERIODE 21/10 - 20/11",
              "transferdate": "20081128",
              "valutacode": "EUR",
              "volgnummer": "00000"
            ])
        })
    ])
])
Found record: ([ /* 1 element */
  "mt940": ([ /* 11 elements */
      "accountno": "0003890413",
      "closingbalance": ([ /* 4 elements */
          "amount": "289,33",
          "creditdebit": "C",
          "currency": "EUR",
          "date": "081119"
        ]),
      "creditamount": "60315,92",
      "creditentries": "000009",
      "debitamount": "9961,90",
      "debitentries": "000004",
      "messageheader1": ({ /* 2 elements */
            ([ /* 2 elements */
              "exportaddress": "PSTBNL21XXXX",
              "exportnumber": "00001"
            ]),
            ([ /* 2 elements */
              "exportaddress": "PSTBNL21XXXX",
              "exportnumber": "00001"
            ])
        }),
      "messagetrailer": ([ /* 2 elements */
          "end": "XXX",
          "start": "-"
        ]),
      "openingbalance": ([ /* 4 elements */
          "amount": "2085,93",
          "creditdebit": "C",
          "currency": "EUR",
          "date": "071205"
        ]),
      "reference": "MPBZ",
      "statements": ({ /* 3 elements */
            ([ /* 6 elements */
              "amount": "11,90",
              "creditdebit": "C",
              "description": "0001239968 Hr X Colijn LANDGRAAF\n"
                "Hr X Colijn LANDGRAAF 1700.163221231474\n"
                "TRANSACTIEDATUM: 06-12-2007",
              "paymentreference": "NONREF",
              "transactiontype": "GT ",
              "valuedate": "071206"
            ]),
            ([ /* 6 elements */
              "amount": "109,10",
              "creditdebit": "D",
              "description": "0002243510 KN: 004952739440412\n"
                "049-8223343 BETREFT FACTUUR D.D. 28-11-2007\n"
                "INCL. 17,42 BTW KPN Telefonie",
              "paymentreference": "004952732340412",
              "transactiontype": "IC ",
              "valuedate": "071206"
            ]),
            ([ /* 6 elements */
              "amount": "34,95",
              "creditdebit": "D",
              "description": "0998234454 KN: 4702162821344484\n"
                "4700124821134444 R.2003-13224 K.122368\n"
                "PEDESCO/KLONSSEN.ORG",
              "paymentreference": "4700134832333684",
              "transactiontype": "DV ",
              "valuedate": "071206"
            ])
        })
    ])
])
---------------------cut here-------------------------

Sample code to actually produce the output:

  // Build a parser over stdin using the allformats description.  A verbose
  // argument of 1 selects the default progress-line width inside create().
  object reader=parsestruct(allformats,Stdio.stdin,1);
  // fetch() yields one mapping per recognised top-level record and a false
  // value once nothing more can be parsed.
  for(mapping rec=reader->fetch();rec;rec=reader->fetch())
     write("Found record: %O\n",rec);

Sample format description needed for parsing the above input:

// Format description handed to parsestruct.
//
// The outer mapping maps a format name ("csv", "mt940") to an array of
// record definitions.  Each record definition is an array of the form
//   ({ "recordname", [options mapping,] field, field, ... })
// where every field is itself an array ({ "fieldname", spec [, options] }).
// A mapping nested inside a format array (see "statements" below) names a
// group of records that is fetched repeatedly and collected under that name.
//
// Record options seen here:
//   "fold":1      - the record's fields are returned directly instead of
//                   being wrapped in a mapping keyed by the record name.
//   "mandatory":1 - only referenced by code that is currently disabled
//                   (#if 0) in parsestruct.
//
// NOTE(review): the bare `()` entries below are not valid Pike; they appear
// to be field definitions whose contents were lost when this post was
// archived.  Each one has to be restored to a ({ "fieldname", spec }) array
// before this description can be used -- confirm against the original post.
array|mapping allformats=
([
 // Comma separated bank statement export; one "gtz" record per line.
 "csv":
 ({
  ({"gtz",
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
  }),
 }),
 // SWIFT MT940 style statement file.
 "mt940":
 ({
  ({"messageheader1",
   (),
   // "CS1" must match a single space; "drop" keeps it out of the result.
   ({"CS1"," ",(["drop":1])}),
   (),
   (),
   (),
  }),
  ({"messageheader3",(["fold":1]),
   (),
   ({"CS1"," ",(["drop":1])}),
   (),
  }),
  ({"TRN",(["fold":1]),
   (),
   (),
  }),
  ({"accountid",(["fold":1]),
   (),
   (),
  }),
  ({"statementno",(["fold":1]),
   (),
   (),
  }),
  ({"openingbalance",
   (),
   (),
   (),
   (),
   (),
  }),
  // Repeating group: each pass folds a statement line and its optional
  // description into one mapping collected under "statements".
  (["statements":
   ({
    ({"statementline",(["fold":1]),
     (),
     (),
     (),
     (),
     (),
     (),    // 3 for Postbank, 4 for ING
     (),
    }),
    ({"informationtoaccountowner",(["fold":1]),
     (),
     (),
    }),
   }),
  ]),
  ({"closingbalance",(["mandatory":1]),
   (),
   (),
   (),
   (),
   (),
  }),
  ({"informationtoaccountowner",(["fold":1]),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
   (),
  }),
  ({"messagetrailer",(["mandatory":1]),
   (),
   (),
  }),
 }),
]);

Implementation of the parsestruct class which actually does the
parsing according to the specs above:

// Parser for record-oriented text input (for example CSV or MT940 files).
//
// The parser is driven by a format description (see getrecord() and fetch()
// for the accepted shapes) and reads from a buffered stream.  Everything
// consumed while trying to match a record is remembered in `alread`, so that
// a failed attempt can push the input back with unread() and let the next
// candidate record try again.
//
// NOTE(review): the archived copy of this class had the contents of several
// array literals stripped (they appeared as `()`).  They have been restored
// from context below; the places where the original text could not be
// derived with certainty are marked individually.
class parsestruct
{ protected Stdio.FILE in;
  protected mapping|array fms;
  // Raw input consumed for the record currently being attempted.
  protected array(string) alread=({});
  // Set once the end of the current input line has been consumed.
  protected int eol;
  // Default number of characters read ahead for an unanchored regex field.
  protected int maxregexmatch=1024;
  // Matches regexes without metacharacters; those are treated as fixed-width
  // literals.
  // NOTE(review): the archived line read "^[^[\](){}<>^$|+*?\]+$", which
  // cannot exclude ']' or '\'; the doubled backslashes are assumed to have
  // been lost in archiving -- confirm.
  protected Regexp simple=Regexp("^[^[\\](){}<>^$|+*?\\\\]+$");
  protected Regexp emptyline=Regexp("^[ \t\v\r\x1a]*$");
  // Value thrown on a parse mismatch.  The value 1 marks a recoverable
  // mismatch: fetch() catches it and rewinds; anything else is rethrown.
  protected mixed severity=1;
  protected int verb=0;
  protected int recordcount=0;

  // formats: format description used when fetch() is called without one.
  // input:   string or stream to parse; defaults to the empty string.
  // verbose: number of characters to display per progress line;
  //          1 selects the default of 70, 0 disables progress output.
  protected void
   create(void|array|mapping formats,void|string|Stdio.File|Stdio.FILE input,
    void|int verbose)
   { fms=formats; verb=verbose==1?70:verbose;
     if(!input)
        input="";
     if(stringp(input))
        input=Stdio.FakeFile(input);
     // The capability test has to look at the supplied stream: `in` is still
     // unset here, so indexing it would fail, and an already buffered stream
     // must be adopted as-is.
     if(!input->unread)
        (in=Stdio.FILE())->assign(input);
     else
        in=input;
   }

  // Reads exactly n characters, recording them for a possible pushback.
  // Throws severity when fewer than n characters are available.
  private string read(int n)
   { string s=in->read(n);
     if(s)
        alread+=({s});
     if(!s||sizeof(s)!=n)
        throw(severity);
     return s;
   }

  // n>0: reads a fixed-width field of n characters that may not span lines.
  // n==0: reads the remainder of the current line (without the line ending)
  //       and flags end-of-line.
  // Throws severity on short input.
  private string gets(int n)
   { string s;
     if(n)
      { s=read(n);
        if(!s||sizeof(s)!=n||has_value(s,"\n"))
           throw(severity);
      }
     else
      { s=in->gets();
        if(!s)
           throw(severity);
        // in->gets() strips the newline; remember it so that a pushback
        // restores the line intact.
        alread+=({s,"\n"});
        if(has_suffix(s,"\r"))
           s=s[..<1];
        eol=1;
      }
     return s;
   }

  // Reads one CSV-style field terminated by any character in delims or by
  // the end of the line.  Double quotes protect delimiters, a doubled quote
  // inside quotes yields a literal quote, and unquoted leading blanks are
  // skipped.  "\r" and "\x1a" are ignored.
  protected string getcsvfield(multiset delims)
   { int leadspace=1,inquotes=0;
     string c;
     array word=({});
     for(c=read(1);c;)
      { if(delims[c])
         { if(!inquotes)
              return word*"";
           word+=({c});
         }
        else switch(c)
         { case "\"":leadspace=0;
              if(!inquotes)
                 inquotes=1;
              else if((c=read(1))=="\"")
                 word+=({c});
              else
               { inquotes=0;
                 // c already holds the character after the closing quote;
                 // re-examine it without reading another one.
                 continue;
               }
              break;
           default:leadspace=0;
           case " ":case "\t":
              if(!leadspace)
                 word+=({c});
              break;
           case "\n":
              if(!inquotes)
               { eol=1;
                 return word*"";
               }
              // NOTE(review): a newline inside quotes is kept verbatim here.
              // The sample output in the original post shows a space at such
              // positions, which may instead be the result of the sample
              // input having been re-wrapped -- confirm.
              word+=({c});
           case "\r":case "\x1a":;
         }
        c=read(1);
      }
     throw(severity);
   }

  // Record definition: ({ "recname", [options,] field, field, ... })
  // Field definition:  ({ "fname", spec [, options] }) where spec is
  //   string   - regular expression the field has to match
  //   int      - field width; 0 means up to the end of the line
  //   multiset - set of delimiter characters ending the field
  //   mapping  - explicit parameters: "width", "regex", "delim", "drop",
  //              "maxregexmatch"
  //   array    - nested record definition (subfields)
  // Record options:
  //   "mandatory":1 - only used by the currently disabled diagnostics
  //   "fold":1      - fold the record's fields into the parent
  //
  // Parses one record according to fmt and returns its fields, either
  // directly (fold) or wrapped as ([recname:fields]).  Throws severity when
  // the input does not match.
  protected mapping getrecord(array fmt,int found)
   { mapping ret=([]),options;
     string recname=fmt[0];
     if(mappingp(fmt[1]))
      { options=fmt[1];fmt=fmt[2..];
#if 0 // srb FIXME
        if(options->mandatory)
           severity=sprintf("Missing mandatory record %s",recname);
#endif
      }
     else
        options=([]),fmt=fmt[1..];
     eol=0;
     foreach(fmt;;array field)
      { string fname=field[0];
        mixed m=field[1];
        string|array|mapping value;
        if(!mappingp(m))
         { if(arrayp(m))
            { value=getrecord(m,found);
              m=([]);
            }
           else
              m=([(intp(m)?"width":(stringp(m)?"regex":"delim")):m]);
           if(sizeof(field)>2)
              m+=field[2];
         }
#if 0 // srb FIXME
        if(!found && m->mandatory)
           severity=sprintf("Missing mandatory field %s",recname);
#endif
        // More fields expected, but the line has already ended.
        if(eol)
           throw(severity);
        if(!zero_type(m->width))
           value=gets(m->width);
        if(m->delim)
         { multiset delim=m->delim;
           if(sizeof(delim-(<",",";","\t"," ">)))
            { // Non-CSV delimiters: plain scan without quote handling; the
              // field has to end before the line does.
              for(value=({});;)
               { string c;
                 if(!(c=read(1)))
                    throw(severity);
                 if(delim[c])
                    break;
                 switch(c)
                  { case "\n":
                       throw(severity);
                    default:
                       value+=({c});
                    case "\r":case "\x1a":;
                  }
               }
              value*="";
            }
           else
              value=getcsvfield(delim);
         }
        if(m->regex)
         { Regexp rgx;
           if(stringp(m->regex))
            { // A regex without metacharacters is a literal: read exactly
              // that many characters and verify them below.
              if(!value && simple->match(m->regex))
               { m->width=sizeof(m->regex);
                 value=gets(m->width);
               }
              m->regex=Regexp("^("+m->regex+")"+(value?"$":""));
            }
           rgx=m->regex;
           if(value)
            { if(!rgx->match(value))
                 throw(severity);
            }
           else
            { // No value yet: read ahead, match at the start of the buffer
              // and hand the unmatched tail back to the stream.
              string buf=in->read(m->maxregexmatch || maxregexmatch);
              if(!buf || !(value=rgx->split(buf)))
               { if(buf)
                    alread+=({buf});
                 throw(severity);
               }
              value=value[0];
              in->unread(buf[sizeof(value)..]);
              // The matched text was taken straight from the stream, so it
              // has to be recorded too; otherwise a later failure in this
              // record would push back incomplete input.
              alread+=({value});
              value-="\r";
              if(has_suffix(value,"\n"))
                 value=value[..<1];
            }
         }
        if(!m->drop)
           ret[fname]=value;
      }
     // The record has to account for the whole line.
     if(!eol && gets(0)!="")
        throw(severity);
     severity=1;
     recordcount++;
     if(verb)
      { // NOTE(review): the literals building the progress line were lost in
        // the archive; only the trailing-separator convention (s[..<1]) is
        // known.  The exact text layout below is a reconstruction.
        array s=({recname,"::"});
        foreach(ret;string name;mixed value)
           if(stringp(value) && sizeof(value))
              s+=({name,":",value,","});
        werror("%d %.*s\r",recordcount,verb,
         replace(s[..<1]*"",({"\n","  ","   "}),({""," "," "})));
      }
     return options->fold?ret:([recname:ret]);
   }

  // Stores entry under name in res.  A second entry for the same name turns
  // the slot into an array, further entries are appended to it.
  private void add2map(mapping res,string name,mixed entry)
   { mapping|array tm = res[name];
     if(tm)
      { if(mappingp(tm))
           tm=({tm,entry});
        else
           tm+=({entry});
        res[name]=tm;
      }
     else
        res[name]=entry;
   }

  // Pushes content back onto the input stream; returns this object so that
  // calls can be chained.
  parsestruct feed(string content)
   { in->unread(content);
     return this;
   }

  // Skips blank lines.  Returns 1 at end of input, 0 when a non-blank line
  // is pending (that line is pushed back).
  int skipemptylines()
   { string line; int eof=1;
     while((line=in->gets()) && emptyline->match(line));
     if(line)
        eof=0,in->unread(line+"\n");
     return eof;
   }

  // Fetches the next chunk of input matching formats (default: the formats
  // given to create()).
  //   array   - sequence of records; each record definition is matched as
  //             often as possible, nested mappings are fetched recursively.
  //   mapping - alternatives by name; each is fetched repeatedly and the
  //             results are collected under that name.
  // Returns the collected mapping, or a false value when nothing matched or
  // the input is exhausted.
  mapping fetch(void|array|mapping formats)
   { mapping ret=([]);
     int skipempty=0;
     if(!formats)
      { if(skipemptylines())
           return UNDEFINED;
        skipempty=1;formats=fms;
      }
     if(arrayp(formats))
      { foreach(formats;;array|mapping fmt)
           if(arrayp(fmt))
            { array alreadold=alread;
              for(int found=0;;found=1)
               { alread=({});
                 mixed err=catch
                  { mapping rec=getrecord(fmt,found);
                    foreach(rec;string name;mixed value)
                       add2map(ret,name,value);
                    alreadold+=alread;
                    // Matched: try the same record definition again.
                    continue;
                  };
                 severity=1;
                 // Only the plain mismatch marker is recoverable.
                 if(err!=1)
                    throw(err);
                 // Rewind whatever the failed attempt consumed.
                 in->unread(alread*"");
                 break;
               }
              alread=alreadold;
            }
           else if(fmt=fetch(fmt))
              ret+=fmt;
        if(skipempty)
           skipemptylines();
      }
     else
        // The loop increment resets found, so this body executes once per
        // call; the caller is expected to call fetch() again for more.
        for(int found=1;found;found=0)
         { foreach(formats;string name;array|mapping subfmt)
              for(;;)
               { mapping m=fetch(subfmt);
                 if(m)
                  { found=1;add2map(ret,name,m);
                    continue;
                  }
                 break;
               }
           if(skipempty && skipemptylines())
              break;
         }
     return sizeof(ret) && ret;
   }
}
-- 
Sincerely,
           Stephen R. van den Berg.

"The difficult we do today; the impossible takes a little longer."