libcbor  0.5.0
libcbor is a C library for parsing and generating CBOR, the general-purpose schema-less binary data format.
unicode.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2014-2017 Pavel Kalvoda <me@pavelkalvoda.com>
3  *
4  * libcbor is free software; you can redistribute it and/or modify
5  * it under the terms of the MIT license. See LICENSE for details.
6  */
7 
8 #include "unicode.h"
9 
/* States of the UTF-8 decoding automaton that callers need to recognise. */
#define UTF8_ACCEPT 0 /* a complete, valid code point has just been decoded */
#define UTF8_REJECT 1 /* the input is not valid UTF-8; every transition out of
                         this state leads back to it */

/* Layout of the utf8d table below. */
#define UTF8_CLASS_TABLE_SIZE 256 /* one character class per byte value */
#define UTF8_CLASS_COUNT 16       /* entries in one state-transition row */
#define UTF8_STATE_COUNT 9        /* automaton states s0..s8 */

/*
 * Combined lookup table of the decoder:
 *  - entries 0..255 map an input byte to its character class,
 *  - the remaining UTF8_STATE_COUNT rows of UTF8_CLASS_COUNT entries map
 *    (state, character class) to the next state.
 */
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..1f */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..3f */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..5f */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..7f */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 80..9f */
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..bf */
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..df */
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, /* e0..ef */
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, /* f0..ff */
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, /* s0..s0 */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s1..s2 */
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s3..s4 */
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s5..s6 */
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s7..s8 */
};

/* Copyright of this function: (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> */
/* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
/*
 * Feed one input byte to the UTF-8 decoding automaton.
 *
 * state: in/out automaton state. Start with UTF8_ACCEPT; it is replaced by the
 *        state reached after consuming `byte`.
 * codep: in/out code point accumulator. It is only read while a multi-byte
 *        sequence is in progress (*state != UTF8_ACCEPT on entry) and holds
 *        the decoded code point whenever UTF8_ACCEPT is returned.
 * byte:  the next input byte, 0x00..0xFF.
 *
 * Returns the new *state: UTF8_ACCEPT when a code point has been completed,
 * UTF8_REJECT when the input is invalid, any other value when more bytes are
 * needed.
 *
 * A `byte` above 0xFF or a *state that is not one of the automaton's states
 * cannot be looked up in utf8d; such calls set *state to UTF8_REJECT, leave
 * *codep untouched and return UTF8_REJECT instead of indexing outside the
 * intended part of the table.
 */
uint32_t _cbor_unicode_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  if (byte >= UTF8_CLASS_TABLE_SIZE || *state >= UTF8_STATE_COUNT) {
    *state = UTF8_REJECT;
    return UTF8_REJECT;
  }

  uint32_t type = utf8d[byte];

  /* Continuation bytes contribute their low 6 bits; a leading byte keeps only
   * the payload bits left over after its class-dependent prefix is masked. */
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[UTF8_CLASS_TABLE_SIZE + *state * UTF8_CLASS_COUNT + type];
  return *state;
}
42 
43 size_t _cbor_unicode_codepoint_count(cbor_data source, size_t source_length, struct _cbor_unicode_status * status)
44 {
45  *status = (struct _cbor_unicode_status) { .location = 0, .status = _CBOR_UNICODE_OK };
46  uint32_t codepoint, state = UTF8_ACCEPT, res;
47  size_t pos = 0, count = 0;
48 
49  for (; pos < source_length; pos++)
50  {
51  res = _cbor_unicode_decode(&state, &codepoint, source[pos]);
52 
53  if (res == UTF8_ACCEPT) {
54  count++;
55  } else if (res == UTF8_REJECT) {
56  goto error;
57  }
58  }
59 
60  /* Unfinished multibyte codepoint */
61  if (state != UTF8_ACCEPT)
62  goto error;
63 
64  return count;
65 
66  error:
67  *status = (struct _cbor_unicode_status) { .location = pos, .status = _CBOR_UNICODE_BADCP };
68  return -1;
69 }
#define UTF8_REJECT
Definition: unicode.c:11
uint32_t _cbor_unicode_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
Definition: unicode.c:32
#define UTF8_ACCEPT
Definition: unicode.c:10
Signals unicode validation error and possibly its location.
Definition: unicode.h:23
const unsigned char * cbor_data
Definition: data.h:20
size_t _cbor_unicode_codepoint_count(cbor_data source, size_t source_length, struct _cbor_unicode_status *status)
Definition: unicode.c:43