// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2011 New Dream Network
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include "common/utf8.h"

#include <string.h>

static int high_bits_set(int c)
{
	int ret = 0;
	while (1) {
		if ((c & 0x80) != 0x080)
			break;
		c <<= 1;
		++ret;
	}
	return ret;
}

/* Encode a 31-bit UTF8 code point to 'buf'.
 * Assumes buf is of size MAX_UTF8_SZ
 * Returns -1 on failure; number of bytes in the encoded value otherwise.
 */
int encode_utf8(unsigned long u, unsigned char *buf)
{
	int i;
	unsigned long max_val[MAX_UTF8_SZ] = {
		0x0000007ful, 0x000007fful, 0x0000fffful,
		0x001ffffful, 0x03fffffful, 0x7ffffffful
	};
	static const int MAX_VAL_SZ = sizeof(max_val) / sizeof(max_val[0]);

	for (i = 0; i < MAX_VAL_SZ; ++i) {
		if (u <= max_val[i])
			break;
	}
	if (i == MAX_VAL_SZ) {
		// This code point is too big to encode.
		return -1;
	}

	if (i == 0) {
		buf[0] = u;
	}
	else {
		signed int j;
		for (j = i; j > 0; --j) {
			buf[j] = 0x80 | (u & 0x3f);
			u >>= 6;
		}

		unsigned char mask = ~(0xFF >> (i + 1));
		buf[0] = mask | u;
	}

	return i + 1;
}

/*
 * Decode a UTF8 character from an array of bytes. Return character code.
 * Upon error, return INVALID_UTF8_CHAR.
 */
unsigned long decode_utf8(unsigned char *buf, int nbytes)
{
	unsigned long code;
	int i, j;

	if (nbytes <= 0)
		return INVALID_UTF8_CHAR;

	if (nbytes == 1) {
		if (buf[0] >= 0x80)
			return INVALID_UTF8_CHAR;
		return buf[0];
	}

	i = high_bits_set(buf[0]);
	if (i != nbytes)
		return INVALID_UTF8_CHAR;
	code = buf[0] & (0xff >> i);
	for (j = 1; j < nbytes; ++j) {
		if ((buf[j] & 0xc0) != 0x80)
			    return INVALID_UTF8_CHAR;
		code = (code << 6) | (buf[j] & 0x3f);
	}

	// Check for invalid code points
	if (code == 0xFFFE)
	    return INVALID_UTF8_CHAR;
	if (code == 0xFFFF)
	    return INVALID_UTF8_CHAR;
	if (code >= 0xD800 && code <= 0xDFFF)
	    return INVALID_UTF8_CHAR;

	return code;
}

int check_utf8(const char *buf, int len)
{
	unsigned char u[MAX_UTF8_SZ];
	int enc_len = 0;
	int i = 0;
	while (1) {
		unsigned int c = buf[i];
		if (i >= len || c < 0x80 || (c & 0xC0) != 0x80) {
			// the start of a new character. Process what we have
			// in the buffer.
			if (enc_len > 0) {
				int re_encoded_len;
				unsigned char re_encoded[MAX_UTF8_SZ];
				unsigned long code = decode_utf8(u, enc_len);
				if (code == INVALID_UTF8_CHAR) {
					//printf("decoded to invalid utf8");
					return i + 1;
				}
				re_encoded_len = encode_utf8(code, re_encoded);
				if (enc_len != re_encoded_len) {
					//printf("originally encoded as %d bytes, "
					//	"but was re-encoded to %d!\n",
					//	enc_len, re_encoded_len);
					return i + 1;
				}
				if (memcmp(u, re_encoded, enc_len) != 0) {
					//printf("re-encoded to a different "
					//	"byte stream!");
					return i + 1;
				}
				//printf("code_point %lu\n", code);
			}
			enc_len = 0;
			if (i >= len)
				break;
			// start collecting again?
			if (c >= 0x80)
				u[enc_len++] = c;
		} else {
			if (enc_len == MAX_UTF8_SZ) {
				//printf("too many enc_len in utf character!\n");
				return i + 1;
			}
			//printf("continuation byte...\n");
			u[enc_len++] = c;
		}
		++i;
	}
	return 0;
}

int check_utf8_cstr(const char *buf)
{
	return check_utf8(buf, strlen(buf));
}

int is_control_character(int c)
{
	return (((c != 0) && (c < 0x20)) || (c == 0x7f));
}

int check_for_control_characters(const char *buf, int len)
{
	int i;
	for (i = 0; i < len; ++i) {
		if (is_control_character((int)(unsigned char)buf[i])) {
 			return i + 1;
		}
	}
	return 0;
}

int check_for_control_characters_cstr(const char *buf)
{
	return check_for_control_characters(buf, strlen(buf));
}