3 * Copyright (C) 2001-2009 Jim Evins <evins@snaught.com>.
5 * This file is part of gLabels.
7 * gLabels is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
12 * gLabels is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with gLabels. If not, see <http://www.gnu.org/licenses/>.
23 #include "merge-text.h"
31 #define LINE_BUF_LEN 1024
35 * The default encoding assumption is that files are in the system encoding.
36 * However, files are checked for a Unicode BOM (Byte Order Mark), which if found
37 * alters the manner in which files are handled.
39 enum UnicodeEncoding {
48 /*===========================================*/
50 /*===========================================*/
/* Private per-instance state of a glMergeText object. */
52 struct _glMergeTextPrivate {
/* When TRUE, the first line of the source supplies the key names. */
55 gboolean line1_has_keys;
/* Encoding chosen when the source is opened; gl_getc branches on it. */
57 enum UnicodeEncoding encoding;
/* Receives the bytes g_iconv produces for one converted character; gl_getc hands them out one at a time. */
59 gchar char_buf[MB_LEN_MAX];
80 /*===========================================*/
82 /*===========================================*/
85 /*===========================================*/
86 /* Local function prototypes */
87 /*===========================================*/
/* GObject overrides (installed on object_class in class_init). */
89 static void gl_merge_text_finalize (GObject *object);
91 static void gl_merge_text_set_property (GObject *object,
96 static void gl_merge_text_get_property (GObject *object,
/* Helpers for the stored key names. */
101 static gchar *key_from_index (glMergeText *merge_text,
103 static void clear_keys (glMergeText *merge_text);
/* Implementations of the glMerge virtual methods (installed on merge_class in class_init). */
105 static GList *gl_merge_text_get_key_list (const glMerge *merge);
106 static gchar *gl_merge_text_get_primary_key (const glMerge *merge);
107 static void gl_merge_text_open (glMerge *merge);
108 static void gl_merge_text_close (glMerge *merge);
109 static glMergeRecord *gl_merge_text_get_record (glMerge *merge);
110 static void gl_merge_text_copy (glMerge *dst_merge,
111 const glMerge *src_merge);
/* Line parser and the matching cleanup for the list it returns. */
113 static GList *parse_line (glMergeText *merge_text,
115 static void free_fields (GList **fields);
119 /*****************************************************************************/
120 /* Boilerplate object stuff. */
121 /*****************************************************************************/
/* Register glMergeText as a GObject type derived from GL_TYPE_MERGE. */
122 G_DEFINE_TYPE (glMergeText, gl_merge_text, GL_TYPE_MERGE)
/*
 * Class initializer: wires up the property accessors, installs the
 * "delim" (char) and "line1_has_keys" (boolean) properties as readable
 * and writable, sets the finalizer, and fills the glMerge virtual-method
 * slots with this file's implementations.
 */
126 gl_merge_text_class_init (glMergeTextClass *class)
128 GObjectClass *object_class = G_OBJECT_CLASS (class);
129 glMergeClass *merge_class = GL_MERGE_CLASS (class);
131 gl_debug (DEBUG_MERGE, "START");
/* NOTE(review): G_DEFINE_TYPE normally initializes gl_merge_text_parent_class itself, so this assignment looks redundant -- confirm. */
133 gl_merge_text_parent_class = g_type_class_peek_parent (class);
135 object_class->set_property = gl_merge_text_set_property;
136 object_class->get_property = gl_merge_text_get_property;
138 g_object_class_install_property
141 g_param_spec_char ("delim", NULL, NULL,
143 (G_PARAM_READABLE | G_PARAM_WRITABLE)));
145 g_object_class_install_property
148 g_param_spec_boolean ("line1_has_keys", NULL, NULL,
150 (G_PARAM_READABLE | G_PARAM_WRITABLE)));
152 object_class->finalize = gl_merge_text_finalize;
/* glMerge virtual methods implemented for delimited text sources. */
154 merge_class->get_key_list = gl_merge_text_get_key_list;
155 merge_class->get_primary_key = gl_merge_text_get_primary_key;
156 merge_class->open = gl_merge_text_open;
157 merge_class->close = gl_merge_text_close;
158 merge_class->get_record = gl_merge_text_get_record;
159 merge_class->copy = gl_merge_text_copy;
161 gl_debug (DEBUG_MERGE, "END");
/*
 * Instance initializer: allocates zero-filled private data and an empty
 * pointer array that will hold the key names.
 */
166 gl_merge_text_init (glMergeText *merge_text)
168 gl_debug (DEBUG_MERGE, "START");
170 merge_text->priv = g_new0 (glMergeTextPrivate, 1);
/* Created without an element free function, so key strings are released explicitly by clear_keys. */
172 merge_text->priv->keys = g_ptr_array_new ();
174 gl_debug (DEBUG_MERGE, "END");
/*
 * Finalizer: frees the stored key strings, the key array and the private
 * struct, then chains up to the parent class finalizer.
 */
179 gl_merge_text_finalize (GObject *object)
181 glMergeText *merge_text = GL_MERGE_TEXT (object);
183 gl_debug (DEBUG_MERGE, "START");
/* NOTE(review): this validity check runs after object has already been cast above. */
185 g_return_if_fail (object && GL_IS_MERGE_TEXT (object));
/* Free the key strings first; the array itself does not own them. */
187 clear_keys (merge_text);
188 g_ptr_array_free (merge_text->priv->keys, TRUE);
189 g_free (merge_text->priv);
191 G_OBJECT_CLASS (gl_merge_text_parent_class)->finalize (object);
193 gl_debug (DEBUG_MERGE, "END");
197 /*--------------------------------------------------------------------------*/
199 /*--------------------------------------------------------------------------*/
/*
 * GObject set_property handler: stores the delimiter character or the
 * line1_has_keys flag from value into the private data and logs the new
 * setting; any other property id is reported as invalid.
 */
201 gl_merge_text_set_property (GObject *object,
206 glMergeText *merge_text;
208 merge_text = GL_MERGE_TEXT (object);
/* "delim" is a signed-char property, hence g_value_get_schar. */
213 merge_text->priv->delim = g_value_get_schar (value);
214 gl_debug (DEBUG_MERGE, "ARG \"delim\" = \"%c\"",
215 merge_text->priv->delim);
218 case ARG_LINE1_HAS_KEYS:
219 merge_text->priv->line1_has_keys = g_value_get_boolean (value);
220 gl_debug (DEBUG_MERGE, "ARG \"line1_has_keys\" = \"%d\"",
221 merge_text->priv->line1_has_keys);
225 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
233 /*--------------------------------------------------------------------------*/
235 /*--------------------------------------------------------------------------*/
/*
 * GObject get_property handler: copies the delimiter character or the
 * line1_has_keys flag from the private data into value; any other
 * property id is reported as invalid.
 */
237 gl_merge_text_get_property (GObject *object,
242 glMergeText *merge_text;
244 merge_text = GL_MERGE_TEXT (object);
249 g_value_set_schar (value, merge_text->priv->delim);
252 case ARG_LINE1_HAS_KEYS:
253 g_value_set_boolean (value, merge_text->priv->line1_has_keys);
257 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
265 /*---------------------------------------------------------------------------*/
266 /* Lookup key name from zero based index. */
267 /*---------------------------------------------------------------------------*/
/*
 * Returns a newly allocated key name for the zero-based column i_field.
 * When the first line supplied keys and one exists for this column, a copy
 * of that key is returned; otherwise the one-based column number is
 * formatted as a decimal string. The result comes from g_strdup or
 * g_strdup_printf, so the caller releases it with g_free.
 */
269 key_from_index (glMergeText *merge_text,
272 if ( merge_text->priv->line1_has_keys &&
273 (i_field < merge_text->priv->keys->len) )
275 return g_strdup (g_ptr_array_index (merge_text->priv->keys, i_field));
/* No stored key for this column: fall back to its one-based position. */
279 return g_strdup_printf ("%d", i_field+1);
284 /*---------------------------------------------------------------------------*/
285 /* Clear stored keys. */
286 /*---------------------------------------------------------------------------*/
/*
 * Frees every stored key string and marks the key array as empty.
 * The array itself is kept so it can be refilled.
 */
288 clear_keys (glMergeText *merge_text)
292 for ( i = 0; i < merge_text->priv->keys->len; i++ )
294 g_free (g_ptr_array_index (merge_text->priv->keys, i));
/* Resets the length field directly; the array's storage is left allocated. */
296 merge_text->priv->keys->len = 0;
300 /*--------------------------------------------------------------------------*/
302 /*--------------------------------------------------------------------------*/
/*
 * Builds a list of key names, one newly allocated string per column
 * (see key_from_index). The column count is the number of keys read from
 * the first line when line1_has_keys is set, otherwise the largest field
 * count recorded in n_fields_max.
 */
304 gl_merge_text_get_key_list (const glMerge *merge)
306 glMergeText *merge_text;
307 gint i_field, n_fields;
310 gl_debug (DEBUG_MERGE, "BEGIN");
312 merge_text = GL_MERGE_TEXT (merge);
314 if ( merge_text->priv->line1_has_keys )
316 n_fields = merge_text->priv->keys->len;
/* Without a header line the widest record seen so far decides how many numbered keys exist. */
320 n_fields = merge_text->priv->n_fields_max;
324 for ( i_field=0; i_field < n_fields; i_field++ )
326 key_list = g_list_append (key_list, key_from_index(merge_text, i_field));
329 gl_debug (DEBUG_MERGE, "END");
335 /*--------------------------------------------------------------------------*/
336 /* Get "primary" key. */
337 /*--------------------------------------------------------------------------*/
/*
 * Returns a newly allocated copy of the key for column 0 (see
 * key_from_index for how that name is chosen).
 */
339 gl_merge_text_get_primary_key (const glMerge *merge)
341 /* For now, let's always assume the first column is the primary key. */
342 return key_from_index (GL_MERGE_TEXT (merge), 0);
345 /*--------------------------------------------------------------------------*/
346 /* Read the byte order marks to determine unicode encoding, if any. */
347 /* See https://en.wikipedia.org/wiki/Byte_order_mark */
348 /*--------------------------------------------------------------------------*/
/*
 * Inspects up to four leading bytes (ch..ch4) of fp for a Unicode byte
 * order mark and returns the matching UnicodeEncoding value. Every
 * fallback branch yields SYSTEM_ENCODING.
 */
349 static enum UnicodeEncoding
350 gl_read_encoding(FILE* fp) {
351 enum UnicodeEncoding encoding;
352 gchar ch, ch2, ch3, ch4;
/* Two trailing zero bytes tell a 4-byte (UTF-32) mark apart from a 2-byte one -- presumably reached after an FF FE prefix; confirm. */
361 if (ch3 == '\0' && ch4 == '\0') {
371 encoding = SYSTEM_ENCODING;
/* 0xFE is the first byte of the big-endian UTF-16 mark (FE FF). */
373 } else if (ch == '\xfe') {
380 encoding = SYSTEM_ENCODING;
382 } else if (ch == '\0') {
/* 00 00 FE FF is the big-endian UTF-32 mark. */
386 if (ch2 == '\0' && ch3 == '\xfe' && ch4 == '\xff') {
392 encoding = SYSTEM_ENCODING;
/* 0xEF is the first byte of the UTF-8 mark (EF BB BF). */
394 } else if (ch == '\xef') {
404 encoding = SYSTEM_ENCODING;
409 encoding = SYSTEM_ENCODING;
413 encoding = SYSTEM_ENCODING;
419 * gLabels get-character routine for possibly Unicode text files.
420 * If the source has a byte order mark (BOM) indicating a Unicode file,
421 * g_iconv is used to convert input characters to GDK-standard UTF8 format.
/*
 * Returns the next input byte for the parser.
 *
 * Bytes still pending from the previous conversion are returned first.
 * For SYSTEM_ENCODING and UTF8 sources the byte comes straight from
 * getc() on the open file. For the remaining encodings one code unit of
 * unit_len bytes is read (two units when the first is a UTF-16 leading
 * surrogate), converted with g_iconv into priv->char_buf, and the first
 * converted byte is returned; the rest is served by later calls.
 * A g_iconv failure is reported with g_warning.
 */
425 gl_getc(glMergeText *merge_text) {
/* Drain bytes left over from the last converted character. */
426 if (merge_text->priv->buf_pos < merge_text->priv->buf_len) {
427 return merge_text->priv->char_buf[merge_text->priv->buf_pos++];
428 } else if (merge_text->priv->encoding == SYSTEM_ENCODING ||
429 merge_text->priv->encoding == UTF8) {
430 return getc(merge_text->priv->fp);
433 * a UTF-16 stream might include surrogates, which encode
434 * characters in successive 16-bit units. If we read a
435 * leading surrogate, read in the trailing one as well for
443 switch (merge_text->priv->encoding) {
458 gsize nBytes = fread(wcbuf, 1, unit_len, merge_text->priv->fp);
/*
 * Leading surrogates are U+D800..U+DBFF, so the high-order byte is
 * 0xD8..0xDB. Masking with 0xfc accepts exactly those four values; a
 * 0xfd mask would accept only 0xD8 and 0xDA and split the other pairs.
 */
461 if (hob_offset >= 0 && (wcbuf[hob_offset] & 0xfc) == 0xd8) {
462 nBytes += fread(wcbuf+unit_len, 1, unit_len, merge_text->priv->fp);
464 gchar* wcbufp = wcbuf;
465 outbufp = merge_text->priv->char_buf;
466 gsize buflen = sizeof(merge_text->priv->char_buf);
467 result = g_iconv(merge_text->priv->g_iconverter, &wcbufp, &nBytes,
470 g_warning("g_iconv: %s", strerror(errno));
/* g_iconv advanced outbufp past the bytes it wrote; the difference is the converted length. */
472 merge_text->priv->buf_len = outbufp - merge_text->priv->char_buf;
473 merge_text->priv->buf_pos = 0;
474 return merge_text->priv->char_buf[merge_text->priv->buf_pos++];
479 /*--------------------------------------------------------------------------*/
480 /* Open merge source. */
481 /*--------------------------------------------------------------------------*/
/*
 * Opens the merge source named by gl_merge_get_src: "-" selects stdin
 * (treated as system encoded), anything else is opened with fopen and
 * probed by gl_read_encoding. An fopen failure is reported with
 * g_warning. For the UTF-16/UTF-32 encodings a converter to UTF-8 is
 * opened. Stored keys and n_fields_max are reset, and when
 * line1_has_keys is set the first line is parsed and its fields become
 * the key names.
 */
483 gl_merge_text_open (glMerge *merge)
485 glMergeText *merge_text;
491 merge_text = GL_MERGE_TEXT (merge);
493 src = gl_merge_get_src (merge);
/* A source of exactly "-" means standard input; no byte order mark probing is done for it. */
497 if (g_utf8_strlen(src, -1) == 1 && src[0] == '-') {
498 merge_text->priv->fp = stdin;
499 merge_text->priv->encoding = SYSTEM_ENCODING;
501 if ((merge_text->priv->fp = fopen (src, "r")) != NULL) {
502 merge_text->priv->encoding = gl_read_encoding(merge_text->priv->fp);
504 g_warning("gl_merge_text_open: %s (%s)",
505 strerror(errno), src);
/* in_codeset stays NULL for encodings that need no converter. */
510 gchar* in_codeset = NULL;
511 switch (merge_text->priv->encoding) {
513 case SYSTEM_ENCODING:
516 in_codeset = "UTF-16BE";
519 in_codeset = "UTF-16LE";
522 in_codeset = "UTF-32BE";
525 in_codeset = "UTF-32LE";
528 if (in_codeset != NULL) {
529 merge_text->priv->g_iconverter = g_iconv_open("UTF8", in_codeset);
530 /* Since we define both codesets, we should always be able to open the converter */
531 g_assert(merge_text->priv->g_iconverter != (GIConv)-1);
/* Drop state left over from any earlier open of this object. */
533 clear_keys (merge_text);
534 merge_text->priv->n_fields_max = 0;
536 if ( merge_text->priv->line1_has_keys )
539 * Extract keys from first line and discard line
542 line1_fields = parse_line (merge_text, merge_text->priv->delim);
543 for ( p = line1_fields; p != NULL; p = p->next )
545 g_ptr_array_add (merge_text->priv->keys, g_strdup (p->data));
547 free_fields (&line1_fields);
556 /*--------------------------------------------------------------------------*/
557 /* Close merge source. */
558 /*--------------------------------------------------------------------------*/
/*
 * Closes the merge source: closes the file if one is open and closes the
 * converter if one was opened, clearing both handles so that a repeated
 * call does nothing.
 */
560 gl_merge_text_close (glMerge *merge)
562 glMergeText *merge_text;
564 merge_text = GL_MERGE_TEXT (merge);
566 if (merge_text->priv->fp != NULL) {
568 fclose (merge_text->priv->fp);
569 merge_text->priv->fp = NULL;
/* The private struct is zero-filled at init, so 0 here means no converter was ever opened. */
572 if (merge_text->priv->g_iconverter != 0) {
573 g_iconv_close(merge_text->priv->g_iconverter);
574 merge_text->priv->g_iconverter = 0;
579 /*--------------------------------------------------------------------------*/
580 /* Get next record from merge source, NULL if no records left (i.e. EOF) */
581 /*--------------------------------------------------------------------------*/
/*
 * Parses the next line with the configured delimiter and turns its fields
 * into a newly allocated glMergeRecord with select_flag set. Each field
 * gets its key from key_from_index and a UTF-8 copy of its text.
 * n_fields_max is raised when this record has more fields than any
 * earlier one.
 */
582 static glMergeRecord *
583 gl_merge_text_get_record (glMerge *merge)
585 glMergeText *merge_text;
587 glMergeRecord *record;
592 merge_text = GL_MERGE_TEXT (merge);
594 delim = merge_text->priv->delim;
596 fields = parse_line (merge_text, delim);
/* parse_line yields NULL when no further line is available. */
597 if ( fields == NULL ) {
601 record = g_new0 (glMergeRecord, 1);
602 record->select_flag = TRUE;
603 for (p=fields, i_field=0; p != NULL; p=p->next, i_field++) {
605 field = g_new0 (glMergeField, 1);
606 field->key = key_from_index (merge_text, i_field);
/* Unless CSV_ALWAYS_UTF8 is defined, system-encoded text is converted from the locale encoding; other encodings are copied as they are. */
607 #ifndef CSV_ALWAYS_UTF8
608 if (merge_text->priv->encoding == SYSTEM_ENCODING) {
/* NOTE(review): g_locale_to_utf8 can return NULL on invalid input; confirm that consumers tolerate a NULL value. */
609 field->value = g_locale_to_utf8 (p->data, -1, NULL, NULL, NULL);
611 field->value = g_strdup (p->data);
614 field->value = g_strdup (p->data);
617 record->field_list = g_list_append (record->field_list, field);
/* The record holds its own copies, so the parsed strings can be released. */
619 free_fields (&fields);
621 if ( i_field > merge_text->priv->n_fields_max )
623 merge_text->priv->n_fields_max = i_field;
630 /*---------------------------------------------------------------------------*/
631 /* Copy merge_text specific fields. */
632 /*---------------------------------------------------------------------------*/
/*
 * Copies the text-specific settings from src_merge to dst_merge: the
 * delimiter, the line1_has_keys flag, a duplicate of every stored key
 * string, and n_fields_max.
 */
634 gl_merge_text_copy (glMerge *dst_merge,
635 const glMerge *src_merge)
637 glMergeText *dst_merge_text;
638 glMergeText *src_merge_text;
641 dst_merge_text = GL_MERGE_TEXT (dst_merge);
642 src_merge_text = GL_MERGE_TEXT (src_merge);
644 dst_merge_text->priv->delim = src_merge_text->priv->delim;
645 dst_merge_text->priv->line1_has_keys = src_merge_text->priv->line1_has_keys;
/* Keys are appended to whatever dst already holds -- presumably dst is freshly created; confirm. */
647 for ( i=0; i < src_merge_text->priv->keys->len; i++ )
649 g_ptr_array_add (dst_merge_text->priv->keys,
650 g_strdup ((gchar *)g_ptr_array_index (src_merge_text->priv->keys, i)));
653 dst_merge_text->priv->n_fields_max = src_merge_text->priv->n_fields_max;
657 /*---------------------------------------------------------------------------*/
658 /* PRIVATE. Parse line. */
660 /* Attempt to be a robust parser of various CSV (and similar) formats. */
662 /* Based on CSV format described in RFC 4180 section 2. */
664 /* Additions to RFC 4180 rules: */
665 /* - delimiters and other special characters may be "escaped" by a leading */
667 /* - C escape sequences for newline (\n) and tab (\t) are also translated. */
668 /* - if quoted text is not followed by a delimiter, any additional text is */
669 /* concatenated with quoted portion. */
671 /* Returns a list of fields. A blank line is considered a line with one */
672 /* empty field. Returns empty (NULL) when done. */
673 /*---------------------------------------------------------------------------*/
/*
 * Reads characters from merge_text through gl_getc and runs them through
 * a state machine until state reaches DONE. Each completed field is
 * appended to the result list as a g_strdup copy, so the caller releases
 * the list with free_fields.
 */
675 parse_line (glMergeText* merge_text,
682 QUOTED, QUOTED_QUOTE1, QUOTED_ESCAPED,
683 SIMPLE, SIMPLE_ESCAPED,
/* Guard for a source that is not open. */
686 if (merge_text->priv->fp == NULL) {
/* field accumulates the text of the field currently being read. */
692 field = g_string_new( "" );
693 while ( state != DONE ) {
694 c=gl_getc (merge_text);
701 /* last field is empty. */
702 list = g_list_append (list, g_strdup (""));
710 /* end of file, no more lines. */
714 /* start a quoted field. */
718 /* simple field, but 1st character is an escape. */
719 state = SIMPLE_ESCAPED;
724 /* field is empty. */
725 list = g_list_append (list, g_strdup (""));
730 /* beginning of a simple field. */
731 field = g_string_append_c (field, c);
741 /* File ended mid way through quoted item, truncate field. */
742 list = g_list_append (list, g_strdup (field->str));
746 /* Possible end of field, but could be 1st of a pair. */
747 state = QUOTED_QUOTE1;
750 /* Escape next character, or special escape, e.g. \n. */
751 state = QUOTED_ESCAPED;
754 /* Use character literally. */
755 field = g_string_append_c (field, c);
764 /* line or file ended after quoted item */
765 list = g_list_append (list, g_strdup (field->str));
769 /* second quote, insert and stay quoted. */
770 field = g_string_append_c (field, c);
774 /* ignore and go to fallback */
/* Field complete: store a copy and reuse the accumulator for the next field. */
781 list = g_list_append (list, g_strdup (field->str));
782 field = g_string_assign( field, "" );
787 /* fallback if not a delim or another quote. */
788 field = g_string_append_c (field, c);
798 /* File ended mid way through quoted item */
799 list = g_list_append (list, g_strdup (field->str));
803 /* Decode "\n" as newline. */
804 field = g_string_append_c (field, '\n');
808 /* Decode "\t" as tab. */
809 field = g_string_append_c (field, '\t');
813 /* Use character literally. */
814 field = g_string_append_c (field, c);
824 /* line or file ended */
825 list = g_list_append (list, g_strdup (field->str));
833 /* Escape next character, or special escape, e.g. \n. */
834 state = SIMPLE_ESCAPED;
/* Field complete: store a copy and reuse the accumulator for the next field. */
840 list = g_list_append (list, g_strdup (field->str));
841 field = g_string_assign( field, "" );
846 /* Use character literally. */
847 field = g_string_append_c (field, c);
857 /* File ended mid way through quoted item */
858 list = g_list_append (list, g_strdup (field->str));
862 /* Decode "\n" as newline. */
863 field = g_string_append_c (field, '\n');
867 /* Decode "\t" as tab. */
868 field = g_string_append_c (field, '\t');
872 /* Use character literally. */
873 field = g_string_append_c (field, c);
880 g_assert_not_reached();
/* Every stored field was copied with g_strdup, so the accumulator and its buffer can be freed. */
885 g_string_free( field, TRUE );
891 /*---------------------------------------------------------------------------*/
892 /* Free list of fields. */
893 /*---------------------------------------------------------------------------*/
/*
 * Walks the list whose head pointer is passed by address.
 * NOTE(review): presumably frees each element's string and then the list
 * itself -- confirm against the loop body.
 */
895 free_fields (GList ** list)
899 for ( p = *list; p != NULL; p = p->next )
912 * Local Variables: -- emacs
914 * c-basic-offset: 8 -- emacs
915 * tab-width: 8 -- emacs
916 * indent-tabs-mode: nil -- emacs