]> git.sur5r.net Git - glabels/blob - src/merge-text.c
Reference glabels.org website
[glabels] / src / merge-text.c
1 /*
2  *  merge-text.c
3  *  Copyright (C) 2001-2009  Jim Evins <evins@snaught.com>.
4  *
5  *  This file is part of gLabels.
6  *
7  *  gLabels is free software: you can redistribute it and/or modify
8  *  it under the terms of the GNU General Public License as published by
9  *  the Free Software Foundation, either version 3 of the License, or
10  *  (at your option) any later version.
11  *
12  *  gLabels is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU General Public License for more details.
16  *
17  *  You should have received a copy of the GNU General Public License
18  *  along with gLabels.  If not, see <http://www.gnu.org/licenses/>.
19  */
20
21 #include <config.h>
22
23 #include "merge-text.h"
24
25 #include <stdio.h>
26
27 #include "debug.h"
28
29 #define LINE_BUF_LEN 1024
30
31
32 /*===========================================*/
33 /* Private types                             */
34 /*===========================================*/
35
36 struct _glMergeTextPrivate {
37
38         gchar             delim;
39         gboolean          line1_has_keys;
40
41         FILE             *fp;
42
43         GPtrArray        *keys;
44         gint              n_fields_max;
45 };
46
/* Signal identifiers.  No signals are registered anywhere in this file. */
enum {
        LAST_SIGNAL
};
50
/* Property identifiers, as installed by gl_merge_text_class_init(). */
enum {
        ARG_0,                  /* Placeholder; never installed as a property. */
        ARG_DELIM,              /* "delim"                                     */
        ARG_LINE1_HAS_KEYS      /* "line1_has_keys"                            */
};
56
57
58 /*===========================================*/
59 /* Private globals                           */
60 /*===========================================*/
61
62
63 /*===========================================*/
64 /* Local function prototypes                 */
65 /*===========================================*/
66
67 static void           gl_merge_text_finalize        (GObject          *object);
68
69 static void           gl_merge_text_set_property    (GObject          *object,
70                                                      guint             param_id,
71                                                      const GValue     *value,
72                                                      GParamSpec       *pspec);
73
74 static void           gl_merge_text_get_property    (GObject          *object,
75                                                      guint             param_id,
76                                                      GValue           *value,
77                                                      GParamSpec       *pspec);
78
79 static gchar         *key_from_index                (glMergeText      *merge_text,
80                                                      gint              i_field);
81 static void           clear_keys                    (glMergeText      *merge_text);
82
83 static GList         *gl_merge_text_get_key_list    (glMerge          *merge);
84 static gchar         *gl_merge_text_get_primary_key (glMerge          *merge);
85 static void           gl_merge_text_open            (glMerge          *merge);
86 static void           gl_merge_text_close           (glMerge          *merge);
87 static glMergeRecord *gl_merge_text_get_record      (glMerge          *merge);
88 static void           gl_merge_text_copy            (glMerge          *dst_merge,
89                                                      glMerge          *src_merge);
90
91 static GList         *parse_line                    (FILE             *fp,
92                                                      gchar             delim);
93 static gchar         *parse_field                   (gchar            *raw_field);
94 static void           free_fields                   (GList           **fields);
95
96
97
98 /*****************************************************************************/
99 /* Boilerplate object stuff.                                                 */
100 /*****************************************************************************/
101 G_DEFINE_TYPE (glMergeText, gl_merge_text, GL_TYPE_MERGE);
102
103
104 static void
105 gl_merge_text_class_init (glMergeTextClass *class)
106 {
107         GObjectClass *object_class = G_OBJECT_CLASS (class);
108         glMergeClass *merge_class  = GL_MERGE_CLASS (class);
109
110         gl_debug (DEBUG_MERGE, "START");
111
112         gl_merge_text_parent_class = g_type_class_peek_parent (class);
113
114         object_class->set_property = gl_merge_text_set_property;
115         object_class->get_property = gl_merge_text_get_property;
116
117         g_object_class_install_property
118                 (object_class,
119                  ARG_DELIM,
120                  g_param_spec_char ("delim", NULL, NULL,
121                                     0, 0x7F, ',',
122                                     (G_PARAM_READABLE | G_PARAM_WRITABLE)));
123
124         g_object_class_install_property
125                 (object_class,
126                  ARG_LINE1_HAS_KEYS,
127                  g_param_spec_boolean ("line1_has_keys", NULL, NULL,
128                                        FALSE,
129                                        (G_PARAM_READABLE | G_PARAM_WRITABLE)));
130
131         object_class->finalize = gl_merge_text_finalize;
132
133         merge_class->get_key_list    = gl_merge_text_get_key_list;
134         merge_class->get_primary_key = gl_merge_text_get_primary_key;
135         merge_class->open            = gl_merge_text_open;
136         merge_class->close           = gl_merge_text_close;
137         merge_class->get_record      = gl_merge_text_get_record;
138         merge_class->copy            = gl_merge_text_copy;
139
140         gl_debug (DEBUG_MERGE, "END");
141 }
142
143
144 static void
145 gl_merge_text_init (glMergeText *merge_text)
146 {
147         gl_debug (DEBUG_MERGE, "START");
148
149         merge_text->priv = g_new0 (glMergeTextPrivate, 1);
150
151         merge_text->priv->keys = g_ptr_array_new ();
152
153         gl_debug (DEBUG_MERGE, "END");
154 }
155
156
157 static void
158 gl_merge_text_finalize (GObject *object)
159 {
160         glMergeText *merge_text = GL_MERGE_TEXT (object);
161         gint         i;
162
163         gl_debug (DEBUG_MERGE, "START");
164
165         g_return_if_fail (object && GL_IS_MERGE_TEXT (object));
166
167         clear_keys (merge_text);
168         g_ptr_array_free (merge_text->priv->keys, TRUE);
169         g_free (merge_text->priv);
170
171         G_OBJECT_CLASS (gl_merge_text_parent_class)->finalize (object);
172
173         gl_debug (DEBUG_MERGE, "END");
174 }
175
176
177 /*--------------------------------------------------------------------------*/
178 /* Set argument.                                                            */
179 /*--------------------------------------------------------------------------*/
180 static void
181 gl_merge_text_set_property (GObject      *object,
182                             guint         param_id,
183                             const GValue *value,
184                             GParamSpec   *pspec)
185 {
186         glMergeText *merge_text;
187
188         merge_text = GL_MERGE_TEXT (object);
189
190         switch (param_id) {
191
192         case ARG_DELIM:
193                 merge_text->priv->delim = g_value_get_char (value);
194                 gl_debug (DEBUG_MERGE, "ARG \"delim\" = \"%c\"",
195                           merge_text->priv->delim);
196                 break;
197
198         case ARG_LINE1_HAS_KEYS:
199                 merge_text->priv->line1_has_keys = g_value_get_boolean (value);
200                 gl_debug (DEBUG_MERGE, "ARG \"line1_has_keys\" = \"%d\"",
201                           merge_text->priv->line1_has_keys);
202                 break;
203
204         default:
205                 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
206                 break;
207
208         }
209
210 }
211
212
213 /*--------------------------------------------------------------------------*/
214 /* Get argument.                                                            */
215 /*--------------------------------------------------------------------------*/
216 static void
217 gl_merge_text_get_property (GObject     *object,
218                             guint        param_id,
219                             GValue      *value,
220                             GParamSpec  *pspec)
221 {
222         glMergeText *merge_text;
223
224         merge_text = GL_MERGE_TEXT (object);
225
226         switch (param_id) {
227
228         case ARG_DELIM:
229                 g_value_set_char (value, merge_text->priv->delim);
230                 break;
231
232         case ARG_LINE1_HAS_KEYS:
233                 g_value_set_boolean (value, merge_text->priv->line1_has_keys);
234                 break;
235
236         default:
237                 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
238                 break;
239
240         }
241
242 }
243
244
245 /*---------------------------------------------------------------------------*/
246 /* Lookup key name from zero based index.                                    */
247 /*---------------------------------------------------------------------------*/
248 static gchar *
249 key_from_index (glMergeText  *merge_text,
250                 gint          i_field)
251 {
252         if ( merge_text->priv->line1_has_keys &&
253              (i_field < merge_text->priv->keys->len) )
254         {
255                 return g_strdup (g_ptr_array_index (merge_text->priv->keys, i_field));
256         }
257         else
258         {
259                 return g_strdup_printf ("%d", i_field+1);
260         }
261 }
262
263
264 /*---------------------------------------------------------------------------*/
265 /* Clear stored keys.                                                        */
266 /*---------------------------------------------------------------------------*/
267 static void
268 clear_keys (glMergeText      *merge_text)
269 {
270         gint i;
271
272         for ( i = 0; i < merge_text->priv->keys->len; i++ )
273         {
274                 g_free (g_ptr_array_index (merge_text->priv->keys, i));
275         }
276         merge_text->priv->keys->len = 0;
277 }
278
279
280 /*--------------------------------------------------------------------------*/
281 /* Get key list.                                                            */
282 /*--------------------------------------------------------------------------*/
283 static GList *
284 gl_merge_text_get_key_list (glMerge *merge)
285 {
286         glMergeText   *merge_text;
287         GList         *record_list, *p_rec;
288         glMergeRecord *record;
289         GList         *p_field;
290         gint           i_field, n_fields_line, n_fields;
291         GList         *key_list;
292         
293         gl_debug (DEBUG_MERGE, "BEGIN");
294
295         merge_text = GL_MERGE_TEXT (merge);
296
297         if ( merge_text->priv->line1_has_keys )
298         {
299                 n_fields = merge_text->priv->keys->len;
300         }
301         else
302         {
303                 n_fields = merge_text->priv->n_fields_max;
304         }
305
306         key_list = NULL;
307         for ( i_field=0; i_field < n_fields; i_field++ )
308         {
309                 key_list = g_list_append (key_list, key_from_index(merge_text, i_field));
310         }
311
312         gl_debug (DEBUG_MERGE, "END");
313
314         return key_list;
315 }
316
317
318 /*--------------------------------------------------------------------------*/
319 /* Get "primary" key.                                                       */
320 /*--------------------------------------------------------------------------*/
321 static gchar *
322 gl_merge_text_get_primary_key (glMerge *merge)
323 {
324         /* For now, let's always assume the first column is the primary key. */
325         return key_from_index (GL_MERGE_TEXT (merge), 0);
326 }
327
328
329 /*--------------------------------------------------------------------------*/
330 /* Open merge source.                                                       */
331 /*--------------------------------------------------------------------------*/
332 static void
333 gl_merge_text_open (glMerge *merge)
334 {
335         glMergeText *merge_text;
336         gchar       *src;
337
338         GList       *line1_fields;
339         GList       *p;
340
341         merge_text = GL_MERGE_TEXT (merge);
342
343         src = gl_merge_get_src (merge);
344
345         if (src != NULL)
346         {
347                 merge_text->priv->fp = fopen (src, "r");
348                 g_free (src);
349
350                 clear_keys (merge_text);
351                 merge_text->priv->n_fields_max = 0;
352
353                 if ( merge_text->priv->line1_has_keys )
354                 {
355                         /*
356                          * Extract keys from first line and discard line
357                          */
358
359                         line1_fields = parse_line (merge_text->priv->fp, merge_text->priv->delim);
360                         for ( p = line1_fields; p != NULL; p = p->next )
361                         {
362                                 g_ptr_array_add (merge_text->priv->keys, g_strdup (p->data));
363                         }
364                         free_fields (&line1_fields);
365                 }
366
367         }
368
369
370 }
371
372
373 /*--------------------------------------------------------------------------*/
374 /* Close merge source.                                                      */
375 /*--------------------------------------------------------------------------*/
376 static void
377 gl_merge_text_close (glMerge *merge)
378 {
379         glMergeText *merge_text;
380
381         merge_text = GL_MERGE_TEXT (merge);
382
383         if (merge_text->priv->fp != NULL) {
384
385                 fclose (merge_text->priv->fp);
386                 merge_text->priv->fp = NULL;
387
388         }
389 }
390
391
392 /*--------------------------------------------------------------------------*/
393 /* Get next record from merge source, NULL if no records left (i.e EOF)     */
394 /*--------------------------------------------------------------------------*/
395 static glMergeRecord *
396 gl_merge_text_get_record (glMerge *merge)
397 {
398         glMergeText   *merge_text;
399         gchar          delim;
400         FILE          *fp;
401         glMergeRecord *record;
402         GList         *fields, *p;
403         gint           i_field;
404         glMergeField  *field;
405
406         merge_text = GL_MERGE_TEXT (merge);
407
408         delim = merge_text->priv->delim;
409         fp    = merge_text->priv->fp;
410
411         fields = parse_line (fp, delim);
412         if ( fields == NULL ) {
413                 return NULL;
414         }
415
416         record = g_new0 (glMergeRecord, 1);
417         record->select_flag = TRUE;
418         for (p=fields, i_field=0; p != NULL; p=p->next, i_field++) {
419
420                 field = g_new0 (glMergeField, 1);
421                 field->key = key_from_index (merge_text, i_field);
422 #ifndef CSV_ALWAYS_UTF8
423                 field->value = g_locale_to_utf8 (p->data, -1, NULL, NULL, NULL);
424 #else
425                 field->value = g_strdup (p->data);
426 #endif
427
428                 record->field_list = g_list_append (record->field_list, field);
429         }
430         free_fields (&fields);
431
432         if ( i_field > merge_text->priv->n_fields_max )
433         {
434                 merge_text->priv->n_fields_max = i_field;
435         }
436
437         return record;
438 }
439
440
441 /*---------------------------------------------------------------------------*/
442 /* Copy merge_text specific fields.                                          */
443 /*---------------------------------------------------------------------------*/
444 static void
445 gl_merge_text_copy (glMerge *dst_merge,
446                     glMerge *src_merge)
447 {
448         glMergeText *dst_merge_text;
449         glMergeText *src_merge_text;
450         gint         i;
451
452         dst_merge_text = GL_MERGE_TEXT (dst_merge);
453         src_merge_text = GL_MERGE_TEXT (src_merge);
454
455         dst_merge_text->priv->delim          = src_merge_text->priv->delim;
456         dst_merge_text->priv->line1_has_keys = src_merge_text->priv->line1_has_keys;
457
458         for ( i=0; i < src_merge_text->priv->keys->len; i++ )
459         {
460                 g_ptr_array_add (dst_merge_text->priv->keys,
461                                  g_strdup ((gchar *)g_ptr_array_index (src_merge_text->priv->keys, i)));
462         }
463
464         dst_merge_text->priv->n_fields_max   = src_merge_text->priv->n_fields_max;
465 }
466
467
468 /*---------------------------------------------------------------------------*/
469 /* PRIVATE.  Parse line.                                                     */
470 /*                                                                           */
471 /* Attempt to be a robust parser of various CSV (and similar) formats.       */
472 /*                                                                           */
473 /* Split into fields, accounting for:                                        */
474 /*   - delimeters may be embedded in quoted text (")                         */
475 /*   - delimeters may be "escaped" by a leading backslash (\)                */
476 /*   - quotes may be embedded in quoted text as two adjacent quotes ("")     */
477 /*   - quotes may be "escaped" either within or outside of quoted text.      */
478 /*   - newlines may be embedded in quoted text, allowing a field to span     */
479 /*     more than one line.                                                   */
480 /*                                                                           */
481 /* This function does not do any parsing of the individual fields, other     */
482 /* than to correctly interpet delimeters.  Actual parsing of the individual  */
483 /* fields is done in parse_field().                                          */
484 /*                                                                           */
485 /* Returns a list of fields.  A blank line is considered a line with one     */
486 /* empty field.  Returns empty (NULL) when done.                             */
487 /*---------------------------------------------------------------------------*/
488 static GList *
489 parse_line (FILE  *fp,
490             gchar  delim )
491 {
492         GList *list = NULL;
493         GString *string;
494         gint c;
495         enum { BEGIN, NORMAL, QUOTED, QUOTED_QUOTE1,
496                NORMAL_ESCAPED, QUOTED_ESCAPED, DONE } state;
497
498         if (fp == NULL) {
499                 return NULL;
500         }
501                
502         state = BEGIN;
503         string = g_string_new( "" );
504         while ( state != DONE ) {
505                 c=getc (fp);
506
507                 switch (state) {
508
509                 case BEGIN:
510                         if ( c == delim )
511                         {
512                                 /* first field is empty. */
513                                 list = g_list_append (list, g_strdup (""));
514                                 state = NORMAL;
515                                 break;
516                         }
517                         switch (c) {
518                         case '"':
519                                 string = g_string_append_c (string, c);
520                                 state = QUOTED;
521                                 break;
522                         case '\\':
523                                 string = g_string_append_c (string, c);
524                                 state = NORMAL_ESCAPED;
525                                 break;
526                         case '\n':
527                                 /* treat as one empty field. */
528                                 list = g_list_append (list, g_strdup (""));
529                                 state = DONE;
530                                 break;
531                         case EOF:
532                                 /* end of file, no more lines. */
533                                 state = DONE;
534                                 break;
535                         default:
536                                 string = g_string_append_c (string, c);
537                                 state = NORMAL;
538                                 break;
539                         }
540                         break;
541
542                 case NORMAL:
543                         if ( c == delim )
544                         {
545                                 list = g_list_append (list, parse_field (string->str));
546                                 string = g_string_assign( string, "" );
547                                 state = NORMAL;
548                                 break;
549                         }
550                         switch (c) {
551                         case '"':
552                                 string = g_string_append_c (string, c);
553                                 state = QUOTED;
554                                 break;
555                         case '\\':
556                                 string = g_string_append_c (string, c);
557                                 state = NORMAL_ESCAPED;
558                                 break;
559                         case '\n':
560                         case EOF:
561                                 list = g_list_append (list, parse_field (string->str));
562                                 state = DONE;
563                                 break;
564                         default:
565                                 string = g_string_append_c (string, c);
566                                 state = NORMAL;
567                                 break;
568                         }
569                         break;
570
571                 case QUOTED:
572                         switch (c) {
573                         case '"':
574                                 string = g_string_append_c (string, c);
575                                 state = QUOTED_QUOTE1;
576                                 break;
577                         case '\\':
578                                 string = g_string_append_c (string, c);
579                                 state = QUOTED_ESCAPED;
580                                 break;
581                         case EOF:
582                                 /* File ended mid way through quoted item */
583                                 list = g_list_append (list, parse_field (string->str));
584                                 state = DONE;
585                                 break;
586                         default:
587                                 string = g_string_append_c (string, c);
588                                 break;
589                         }
590                         break;
591
592                 case QUOTED_QUOTE1:
593                         if ( c == delim )
594                         {
595                                 list = g_list_append (list, parse_field (string->str));
596                                 string = g_string_assign( string, "" );
597                                 state = NORMAL;
598                                 break;
599                         }
600                         switch (c) {
601                         case '"':
602                                 /* insert quotes in string, stay quoted. */
603                                 string = g_string_append_c (string, c);
604                                 state = QUOTED;
605                                 break;
606                         case '\n':
607                         case EOF:
608                                 /* line or file ended after quoted item */
609                                 list = g_list_append (list, parse_field (string->str));
610                                 state = DONE;
611                                 break;
612                         default:
613                                 string = g_string_append_c (string, c);
614                                 state = NORMAL;
615                                 break;
616                         }
617                         break;
618
619                 case NORMAL_ESCAPED:
620                         switch (c) {
621                         case EOF:
622                                 /* File ended mid way through quoted item */
623                                 list = g_list_append (list, parse_field (string->str));
624                                 state = DONE;
625                                 break;
626                         default:
627                                 string = g_string_append_c (string, c);
628                                 state = NORMAL;
629                                 break;
630                         }
631                         break;
632
633                 case QUOTED_ESCAPED:
634                         switch (c) {
635                         case EOF:
636                                 /* File ended mid way through quoted item */
637                                 list = g_list_append (list, parse_field (string->str));
638                                 state = DONE;
639                                 break;
640                         default:
641                                 string = g_string_append_c (string, c);
642                                 state = QUOTED;
643                                 break;
644                         }
645                         break;
646
647                 default:
648                         g_assert_not_reached();
649                         break;
650                 }
651
652         }
653         g_string_free( string, TRUE );
654
655         return list;
656 }
657
658
659 /*---------------------------------------------------------------------------*/
660 /* PRIVATE.  Parse field.                                                    */
661 /*                                                                           */
662 /*  - Strip leading and trailing white space, unless quoted.                 */
663 /*  - Strip CR, unless escaped.                                              */
664 /*  - Expand '\n' and '\t' into newline and tab characters.                  */
665 /*  - Remove quotes, unless escaped (\" anywhere or "" within quotes)        */
666 /*---------------------------------------------------------------------------*/
667 static gchar *
668 parse_field (gchar  *raw_field)
669 {
670         GString *string;
671         gchar   *pass1_field, *c, *field;
672         enum { NORMAL, NORMAL_ESCAPED, QUOTED, QUOTED_ESCAPED, QUOTED_QUOTE1} state;
673
674
675         /*
676          * Pass 1: remove leading and trailing spaces.
677          */
678         pass1_field = g_strdup (raw_field);
679         g_strstrip (pass1_field);
680
681         /*
682          * Pass 2: resolve quoting and escaping.
683          */
684         state = NORMAL;
685         string = g_string_new( "" );
686         for ( c=pass1_field; *c != 0; c++ )
687         {
688                 switch (state) {
689
690                 case NORMAL:
691                         switch (*c) {
692                         case '\\':
693                                 state = NORMAL_ESCAPED;
694                                 break;
695                         case '"':
696                                 state = QUOTED;
697                                 break;
698                         case '\r':
699                                 /* Strip CR. */
700                                 break;
701                         default:
702                                 string = g_string_append_c (string, *c);
703                                 break;
704                         }
705                         break;
706
707                 case NORMAL_ESCAPED:
708                         switch (*c) {
709                         case 'n':
710                                 string = g_string_append_c (string, '\n');
711                                 state = NORMAL;
712                                 break;
713                         case 't':
714                                 string = g_string_append_c (string, '\t');
715                                 state = NORMAL;
716                                 break;
717                         default:
718                                 string = g_string_append_c (string, *c);
719                                 state = NORMAL;
720                                 break;
721                         }
722                         break;
723
724                 case QUOTED:
725                         switch (*c) {
726                         case '\\':
727                                 state = QUOTED_ESCAPED;
728                                 break;
729                         case '"':
730                                 state = QUOTED_QUOTE1;
731                                 break;
732                         case '\r':
733                                 /* Strip CR. */
734                                 break;
735                         default:
736                                 string = g_string_append_c (string, *c);
737                                 break;
738                         }
739                         break;
740
741                 case QUOTED_ESCAPED:
742                         switch (*c) {
743                         case 'n':
744                                 string = g_string_append_c (string, '\n');
745                                 state = QUOTED;
746                                 break;
747                         case 't':
748                                 string = g_string_append_c (string, '\t');
749                                 state = QUOTED;
750                                 break;
751                         default:
752                                 string = g_string_append_c (string, *c);
753                                 state = QUOTED;
754                                 break;
755                         }
756                         break;
757
758                 case QUOTED_QUOTE1:
759                         switch (*c) {
760                         case '"':
761                                 /* insert quotes in string, stay quoted. */
762                                 string = g_string_append_c (string, *c);
763                                 state = QUOTED;
764                                 break;
765                         case '\r':
766                                 /* Strip CR, return to QUOTED. */
767                                 state = QUOTED;
768                                 break;
769                         default:
770                                 string = g_string_append_c (string, *c);
771                                 state = NORMAL;
772                                 break;
773                         }
774                         break;
775
776                 default:
777                         g_assert_not_reached();
778                         break;
779                 }
780
781         }
782
783         field = g_strdup (string->str);
784         g_string_free( string, TRUE );
785         g_free (pass1_field);
786
787         return field;
788 }
789
790
791 /*---------------------------------------------------------------------------*/
792 /* Free list of fields.                                                      */
793 /*---------------------------------------------------------------------------*/
794 void
795 free_fields (GList ** list)
796 {
797         GList *p;
798
799         for (p = *list; p != NULL; p = p->next) {
800                 g_free (p->data);
801                 p->data = NULL;
802         }
803
804         g_list_free (*list);
805         *list = NULL;
806 }
807
808
809
810 /*
811  * Local Variables:       -- emacs
812  * mode: C                -- emacs
813  * c-basic-offset: 8      -- emacs
814  * tab-width: 8           -- emacs
815  * indent-tabs-mode: nil  -- emacs
816  * End:                   -- emacs
817  */