changeset 563:9f6c46b77910

initial revision
author mortenp
date Mon, 04 Sep 2006 07:21:38 +1000
parents 0e6215f03447
children 8de4095ec6ea
files include/unistr.h listtexts/digest src/unistr.c
diffstat 3 files changed, 513 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/unistr.h	Mon Sep 04 07:21:38 2006 +1000
@@ -0,0 +1,47 @@
+/* Copyright (C) 2005 Morten K. Poulsen <morten at afdelingp.dk>
+ *
+ * $Id$
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef UNISTR_H
+#define UNISTR_H
+
+typedef unsigned int unistr_char;
+
+typedef struct _unistr {
+	size_t len;
+	size_t alloc_len;
+	unistr_char *chars;
+} unistr;
+
+unistr *unistr_new(void);
+void unistr_free(unistr *str);
+int unistr_cmp(unistr *str1, unistr *str2);
+unistr *unistr_dup(unistr *str);
+void unistr_append_char(unistr *str, unistr_char uc);
+void unistr_append_usascii(unistr *str, char *binary, size_t bin_len);
+void unistr_append_utf8(unistr *str, char *binary, size_t bin_len);
+void unistr_append_iso88591(unistr *str, char *binary, size_t bin_len);
+void unistr_dump(unistr *str);
+char *unistr_to_utf8(unistr *str);
+char *unistr_header_to_utf8(char *str);
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/listtexts/digest	Mon Sep 04 07:21:38 2006 +1000
@@ -0,0 +1,5 @@
+Subject: Digest of $listaddr$ issue $digestissue$ ($digestinterval$)
+
+Topics (messages $digestfirst$ throught $digestlast$):
+
+$digestthreads$
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/unistr.c	Mon Sep 04 07:21:38 2006 +1000
@@ -0,0 +1,461 @@
+/* Copyright (C) 2005 Morten K. Poulsen <morten at afdelingp.dk>
+ *
+ * $Id$
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <ctype.h>
+
+#include "mlmmj.h"
+#include "unistr.h"
+#include "log_error.h"
+#include "memory.h"
+
+
+unistr *unistr_new(void)
+{
+	unistr *ret;
+
+	ret = mymalloc(sizeof(unistr));
+	ret->len = 0;
+	ret->alloc_len = 64;
+	ret->chars = mymalloc(ret->alloc_len * sizeof(unistr_char));
+
+	return ret;
+}
+
+
+void unistr_free(unistr *str)
+{
+	if (!str)
+		return;
+	myfree(str->chars);
+	myfree(str);
+}
+
+
+int unistr_cmp(unistr *str1, unistr *str2)
+{
+	unsigned int i;
+
+	for (i=0; i<str1->len; i++) {
+		if (str1->chars[i] < str2->chars[i]) {
+			return -1;
+		} else if (str1->chars[i] > str2->chars[i]) {
+			return 1;
+		}
+	}
+	if (str2->len > str1->len) {
+		return 1;
+	}
+	return 0;
+}
+
+
+unistr *unistr_dup(unistr *str)
+{
+	unistr *ret;
+	unsigned int i;
+
+	ret = unistr_new();
+	for (i=0; i<str->len; i++) {
+		unistr_append_char(ret, str->chars[i]);
+	}
+
+	return ret;
+}
+
+
+void unistr_append_char(unistr *str, unistr_char uc)
+{
+	if (str->len >= str->alloc_len) {
+		str->alloc_len *= 2;
+		str->chars = myrealloc(str->chars, str->alloc_len * sizeof(unistr_char));
+	}
+	str->chars[str->len++] = uc;
+}
+
+
+void unistr_append_usascii(unistr *str, char *binary, size_t bin_len)
+{
+	unsigned int i;
+
+	for (i=0; i<bin_len; i++) {
+		if ((unsigned char)binary[i] > 0x7F) {
+			unistr_append_char(str, '?');
+		} else {
+			unistr_append_char(str, (unsigned char)binary[i]);
+		}
+	}
+}
+
+
+void unistr_append_utf8(unistr *str, char *binary, size_t bin_len)
+{
+	unsigned int i, j;
+	unistr_char ch;
+	unsigned char *bin = (unsigned char *)binary;
+
+	for (i=0; i<bin_len; i++) {
+		if (bin[i] <= 0x7F) {  /* 1 */
+			unistr_append_char(str, bin[i]);
+		} else {
+			if ((bin[i] & 224) == 192) {  /* 2 */
+				ch = bin[i] & 31;
+				j = 1;
+			} else if ((bin[i] & 240) == 224) {  /* 3 */
+				ch = bin[i] & 15;
+				j = 2;
+			} else if ((bin[i] & 248) == 240) {  /* 4 */
+				ch = bin[i] & 7;
+				j = 3;
+			} else if ((bin[i] & 252) == 248) {  /* 5 */
+				ch = bin[i] & 3;
+				j = 4;
+			} else if ((bin[i] & 254) == 252) {  /* 6 */
+				ch = bin[i] & 1;
+				j = 5;
+			} else {
+				/* invalid byte sequence */
+				unistr_append_char(str, '?');
+				continue;
+			}
+			if (ch == 0) {
+				/* invalid encoding, no data bits set in first byte */
+				unistr_append_char(str, '?');
+				continue;
+			}
+			for (;j>0; j--) {
+				i++;
+				ch <<= 6;
+				if ((bin[i] & 192) != 128) {
+					/* invalid byte sequence */
+					ch = '?';
+					break;
+				}
+				ch |= bin[i] & 63;
+			}
+			unistr_append_char(str, ch);
+		}
+	}
+}
+
+
+void unistr_append_iso88591(unistr *str, char *binary, size_t bin_len)
+{
+	unsigned int i;
+
+	for (i=0; i<bin_len; i++) {
+		if (binary[i] == 0x00) {
+			unistr_append_char(str, '?');
+		} else {
+			unistr_append_char(str, (unsigned char)binary[i]);
+		}
+	}
+}
+
+
+void unistr_dump(unistr *str)
+{
+	unsigned int i;
+
+	printf("unistr_dump(%p)\n", (void *)str);
+	printf(" ->len = %d\n", str->len);
+	printf(" ->alloc_len = %d\n", str->alloc_len);
+	printf(" ->chars [ ");
+	for (i=0; i<str->len; i++) {
+		if ((str->chars[i] <= 0x7F) && (str->chars[i] != '\n')) {
+			printf("'%c' ", str->chars[i]);
+		} else {
+			printf("0x%02X ", str->chars[i]);
+		}
+	}
+	printf("]\n");
+}
+
+
+char *unistr_to_utf8(unistr *str)
+{
+	unsigned int i;
+	size_t len = 0;
+	char *ret;
+	char *p;
+
+	for (i=0; i<str->len; i++) {
+		if (str->chars[i] <= 0x7F) {
+			len++;
+		} else if (str->chars[i] <= 0x7FF) {
+			len += 2;
+		} else if (str->chars[i] <= 0xFFFF) {
+			len += 3;
+		} else if (str->chars[i] <= 0x1FFFFF) {
+			len += 4;
+		} else if (str->chars[i] <= 0x3FFFFFF) {
+			len += 5;
+		} else if (str->chars[i] <= 0x7FFFFFFF) {
+			len += 6;
+		} else {
+			errno = 0;
+			log_error(LOG_ARGS, "unistr_to_utf8(): can not utf-8 encode"
+					"U+%04X", str->chars[i]);
+			return mystrdup("");
+		}
+	}
+	len++;  /* NUL */
+
+	ret = mymalloc(len);
+	p = ret;
+
+	for (i=0; i<str->len; i++) {
+		if (str->chars[i] <= 0x7F) {  /* 1 */
+			*(p++) = str->chars[i];
+		} else if (str->chars[i] <= 0x7FF) {  /* 2 */
+			*(p++) = 192 + ((str->chars[i] & 1984) >> 6);
+			*(p++) = 128 + (str->chars[i] & 63);
+		} else if (str->chars[i] <= 0xFFFF) {  /* 3 */
+			*(p++) = 224 + ((str->chars[i] & 61440) >> 12);
+			*(p++) = 128 + ((str->chars[i] & 4032) >> 6);
+			*(p++) = 128 + (str->chars[i] & 63);
+		} else if (str->chars[i] <= 0x1FFFFF) {  /* 4 */
+			*(p++) = 240 + ((str->chars[i] & 1835008) >> 18);
+			*(p++) = 128 + ((str->chars[i] & 258048) >> 12);
+			*(p++) = 128 + ((str->chars[i] & 4032) >> 6);
+			*(p++) = 128 + (str->chars[i] & 63);
+		} else if (str->chars[i] <= 0x3FFFFFF) {  /* 5 */
+			*(p++) = 248 + ((str->chars[i] & 50331648) >> 24);
+			*(p++) = 128 + ((str->chars[i] & 16515072) >> 18);
+			*(p++) = 128 + ((str->chars[i] & 258048) >> 12);
+			*(p++) = 128 + ((str->chars[i] & 4032) >> 6);
+			*(p++) = 128 + (str->chars[i] & 63);
+		} else if (str->chars[i] <= 0x7FFFFFFF) {  /* 6 */
+			*(p++) = 252 + ((str->chars[i] & 1073741824) >> 30);
+			*(p++) = 128 + ((str->chars[i] & 1056964608) >> 24);
+			*(p++) = 128 + ((str->chars[i] & 16515072) >> 18);
+			*(p++) = 128 + ((str->chars[i] & 258048) >> 12);
+			*(p++) = 128 + ((str->chars[i] & 4032) >> 6);
+			*(p++) = 128 + (str->chars[i] & 63);
+		} else {
+			errno = 0;
+			log_error(LOG_ARGS, "unistr_to_utf8(): can not utf-8 encode"
+					"U+%04X", str->chars[i]);
+		}
+	}
+	*(p++) = '\0';
+
+	return ret;
+}
+
+
+static int hexval(char ch)
+{
+	ch = tolower(ch);
+
+	if ((ch >= 'a') && (ch <= 'f')) {
+		return 10 + ch - 'a';
+	}
+
+	if ((ch >= '0') && (ch <= '9')) {
+		return ch - '0';
+	}
+
+	return 0;
+}
+
+
+static void decode_qp(char *str, char **binary, size_t *bin_len)
+{
+	int i;
+
+	/* decoded string will never be longer, and we don't include a NUL */
+	*binary = mymalloc(strlen(str));
+	*bin_len = 0;
+
+	for (i=0; str[i]; i++) {
+		if ((str[i] == '=') && isxdigit(str[i+1]) && isxdigit(str[i+2])) {
+			(*binary)[(*bin_len)++] = (hexval(str[i+1]) << 4) + hexval(str[i+2]);
+			i += 2;
+		} else if (str[i] == '_') {
+			(*binary)[(*bin_len)++] = 0x20;
+		} else {
+			(*binary)[(*bin_len)++] = str[i];
+		}
+	}
+}
+
+
+static void decode_base64(char *str, char **binary, size_t *bin_len)
+{
+	int tab[] = {
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
+		52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
+		-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+		-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+		41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+	};
+	size_t len;
+	unsigned int i;
+	unsigned int out;
+	int out_numbits;
+	int val;
+
+	/* decoded string will never be longer, and we don't include a NUL */
+	len = strlen(str);
+	*binary = mymalloc(len);
+	*bin_len = 0;
+
+	out = 0;
+	out_numbits = 0;
+	for (i=0; i<strlen(str); i++) {
+		val = tab[(unsigned char)str[i]];
+		if (val == -1)
+			continue;
+		out <<= 6;
+		out |= val;
+		out_numbits += 6;
+		if (out_numbits >= 8) {
+			(*binary)[(*bin_len)++] = (out >> (out_numbits - 8)) & 255;
+			out_numbits -= 8;
+		}
+	}
+}
+
+
+static void header_decode_word(char *word, unistr *ret)
+{
+	char *my_word;
+	char *charset, *encoding, *string, *end;
+	char *binary;
+	size_t bin_len;
+
+
+	if ((word[0] != '=') || (word[1] != '?')) {
+		unistr_append_usascii(ret, word, strlen(word));
+		return;
+	}
+
+	my_word = mystrdup(word);
+
+	charset = my_word + 2;
+
+	if ((encoding = strchr(charset, '?')) == NULL) {
+		/* missing encoding */
+		unistr_append_usascii(ret, "???", 3);
+		myfree(my_word);
+		return;
+	}
+	*(encoding++) = '\0';
+
+	if ((string = strchr(encoding, '?')) == NULL) {
+		/* missing string */
+		unistr_append_usascii(ret, "???", 3);
+		myfree(my_word);
+		return;
+	}
+	*(string++) = '\0';
+
+	if ((end = strchr(string, '?')) == NULL) {
+		/* missing end */
+		unistr_append_usascii(ret, "???", 3);
+		myfree(my_word);
+		return;
+	}
+	*(end++) = '\0';
+	if ((end[0] != '=') || (end[1] != '\0')) {
+		/* broken end */
+		unistr_append_usascii(ret, "???", 3);
+		myfree(my_word);
+		return;
+	}
+
+	if (tolower(encoding[0]) == 'q') {
+		decode_qp(string, &binary, &bin_len);
+	} else if (tolower(encoding[0]) == 'b') {
+		decode_base64(string, &binary, &bin_len);
+	} else {
+		/* unknown encoding */
+		unistr_append_usascii(ret, "???", 3);
+		myfree(my_word);
+		return;
+	}
+
+	if (strcasecmp(charset, "us-ascii") == 0) {
+		unistr_append_usascii(ret, binary, bin_len);
+	} else if (strcasecmp(charset, "utf-8") == 0) {
+		unistr_append_utf8(ret, binary, bin_len);
+	} else if (strcasecmp(charset, "iso-8859-1") == 0) {
+		unistr_append_iso88591(ret, binary, bin_len);
+	} else {
+		/* unknown charset */
+		unistr_append_usascii(ret, "???", 3);
+	}
+
+	myfree(my_word);
+	myfree(binary);
+}
+
+
+/* IN: "=?iso-8859-1?Q?hyggem=F8de?= torsdag"
+ * OUT: "hyggem\xC3\xB8de torsdag"
+ */
+char *unistr_header_to_utf8(char *str)
+{
+	char *my_str;
+	char *word;
+	char *p;
+	unistr *us;
+	char *ret;
+
+	my_str = mystrdup(str);
+	us = unistr_new();
+
+	word = strtok_r(my_str, " \t\n", &p);
+	while (word) {
+		header_decode_word(word, us);
+		word = strtok_r(NULL, " \t\n", &p);
+		if (word)
+			unistr_append_char(us, ' ');
+	}
+
+	myfree(my_str);
+
+	ret = unistr_to_utf8(us);
+	unistr_free(us);
+
+	return ret;
+}