#!/usr/bin/python3
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import unicode

_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

_trailer = """

}  // namespace re2

"""

n16 = 0
n32 = 0

def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
  ranges = []
  last = -100
  for c in codes:
    if c == last+1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
    last = c
  return ranges

def PrintRanges(type, name, ranges):
  """Print the ranges as an array of type named name."""
  print("static const %s %s[] = {" % (type, name))
  for lo, hi in ranges:
    print("\t{ %d, %d }," % (lo, hi))
  print("};")

# def PrintCodes(type, name, codes):
#   """Print the codes as an array of type named name."""
#   print("static %s %s[] = {" % (type, name))
#   for c in codes:
#     print("\t%d," % (c,))
#   print("};")

def PrintGroup(name, codes):
  """Print the data structures for the group of codes.
  Return a UGroup literal for the group."""

  # See unicode_groups.h for a description of the data structure.

  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # Pull singleton ranges out of range16.
  # code16 = [lo for lo, hi in range16 if lo == hi]
  # range16 = [[lo, hi] for lo, hi in range16 if lo != hi]

  global n16
  global n32
  n16 += len(range16)
  n32 += len(range32)

  ugroup = "{ \"%s\", +1" % (name,)
  # if len(code16) > 0:
  #   PrintCodes("uint16_t", name+"_code16", code16)
  #   ugroup += ", %s_code16, %d" % (name, len(code16))
  # else:
  #   ugroup += ", 0, 0"
  if len(range16) > 0:
    PrintRanges("URange16", name+"_range16", range16)
    ugroup += ", %s_range16, %d" % (name, len(range16))
  else:
    ugroup += ", 0, 0"
  if len(range32) > 0:
    PrintRanges("URange32", name+"_range32", range32)
    ugroup += ", %s_range32, %d" % (name, len(range32))
  else:
    ugroup += ", 0, 0"
  ugroup += " }"
  return ugroup

def main():
  categories = unicode.Categories()
  scripts = unicode.Scripts()
  print(_header)
  ugroups = []
  for name in sorted(categories):
    ugroups.append(PrintGroup(name, categories[name]))
  for name in sorted(scripts):
    ugroups.append(PrintGroup(name, scripts[name]))
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  ugroups.sort()
  for ug in ugroups:
    print("\t%s," % (ug,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)

if __name__ == '__main__':
  main()