changeset 896:7ed55458d9e3

Use iconv to convert unknown character sets.
author Ben Schmidt
date Wed, 12 Feb 2014 09:19:03 +1100
parents ef3aff5e6255
children 65ef98d16f17
files ChangeLog configure.ac src/unistr.c
diffstat 3 files changed, 50 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Mon Feb 10 14:42:32 2014 +1100
+++ b/ChangeLog	Wed Feb 12 09:19:03 2014 +1100
@@ -1,3 +1,4 @@
+ o Use iconv to convert unknown character sets
  o Handle unfolded header lines better
  o Add a tunable for moderation request lifetime (Timo Boettcher)
  o Ensure mlmmj-send always honours tunables (e.g. relayhost)
--- a/configure.ac	Mon Feb 10 14:42:32 2014 +1100
+++ b/configure.ac	Wed Feb 12 09:19:03 2014 +1100
@@ -14,11 +14,13 @@
 # Checks for libraries.
 AC_CHECK_LIB(socket,socket)
 AC_CHECK_LIB(nsl,gethostbyname)
+AC_CHECK_LIB(iconv,iconv_open)
 
 # Checks for header files.
 AC_HEADER_STDC
 AC_CHECK_HEADERS([arpa/inet.h fcntl.h netinet/in.h stddef.h stdlib.h string.h])
 AC_CHECK_HEADERS([sys/socket.h syslog.h unistd.h time.h])
+AC_CHECK_HEADER([iconv.h])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_CONST
--- a/src/unistr.c	Mon Feb 10 14:42:32 2014 +1100
+++ b/src/unistr.c	Wed Feb 12 09:19:03 2014 +1100
@@ -30,12 +30,16 @@
 #include <sys/stat.h>
 #include <sys/wait.h>
 #include <ctype.h>
+#include <iconv.h>
 
 #include "mlmmj.h"
 #include "unistr.h"
 #include "log_error.h"
 #include "memory.h"
 
+/* This is allocated on the stack, so it can't be too big. */
+#define ICONV_BUFFER_SIZE 160
+
 
 unistr *unistr_new(void)
 {
@@ -180,6 +184,48 @@
 }
 
 
+/* Convert bin_len bytes of binary from charset to UTF-8 using iconv and
+ * append the result to str. Input that iconv rejects is replaced by "?";
+ * if the charset cannot be opened, or iconv fails in some other way,
+ * "???" is appended instead and conversion stops. */
+void unistr_append_iconv(unistr *str, char *binary, size_t bin_len,
+		const char * charset)
+{
+	char bytes[ICONV_BUFFER_SIZE];
+	iconv_t cd = iconv_open("UTF-8", charset);
+
+	if (cd == (iconv_t)-1) {
+		unistr_append_usascii(str, "???", 3);
+		return;
+	}
+	while (bin_len > 0) {
+		char *buffer = bytes;
+		size_t bufferleft = ICONV_BUFFER_SIZE;
+		int err = 0;
+
+		if (iconv(cd, &binary, &bin_len, &buffer, &bufferleft) == (size_t)-1) {
+			err = errno;	/* saved: the appends below may clobber it */
+		}
+		/* keep whatever was converted, whether or not iconv failed */
+		unistr_append_utf8(str, bytes, ICONV_BUFFER_SIZE - bufferleft);
+		if (err == EILSEQ) {
+			/* illegal sequence; skip one byte and try to recover */
+			unistr_append_usascii(str, "?", 1);
+			bin_len--;
+			binary++;
+		} else if (err == EINVAL) {
+			/* incomplete sequence at end of input; we're done */
+			unistr_append_usascii(str, "?", 1);
+			break;
+		} else if (err != 0 && err != E2BIG) {
+			/* some other error; abort */
+			unistr_append_usascii(str, "???", 3);
+			break;
+		}
+	}
+	iconv_close(cd);
+}
+
+
 void unistr_dump(const unistr *str)
 {
 	unsigned int i;
@@ -421,8 +467,7 @@
 	} else if (strcasecmp(charset, "iso-8859-1") == 0) {
 		unistr_append_iso88591(ret, binary, bin_len);
 	} else {
-		/* unknown charset */
-		unistr_append_usascii(ret, "???", 3);
+		unistr_append_iconv(ret, binary, bin_len, charset);
 	}
 
 	myfree(my_word);