"""
Download the latest unicode tables from https://www.unicode.org and create
a .txt file containing all the names, blocks and character codes
"""
import bisect
import sys
import os
import logging
from urllib import request

curr_path = os.path.dirname(__file__)
logging.basicConfig(level=logging.DEBUG)

# Be compatible with both python 2 and 3
if sys.version_info[0] >= 3:
    unichr = chr

BASE_URL = "https://www.unicode.org/Public/UCD/latest/ucd"


def _download(file_name, description):
    """Fetch ``BASE_URL/<file_name>`` and return its content as text.

    ``description`` is only used in the progress log message.
    """
    logging.info("Downloading %s data...", description)
    with request.urlopen(f"{BASE_URL}/{file_name}") as req:
        content = req.read().decode("utf-8")
    logging.info("Done")
    return content


def get_blocks():
    """
    Download the info file for Unicode blocks (``Blocks.txt``).
    """
    return _download("Blocks.txt", "block")


def get_data():
    """
    Download the info file for Unicode characters (``UnicodeData.txt``).
    """
    return _download("UnicodeData.txt", "character")


def clean(text):
    """
    Remove all blank or commented lines from a string.

    Every remaining line is stripped of surrounding whitespace. A line is
    treated as a comment when its first non-blank character is ``#``.
    """
    stripped = (line.strip() for line in text.split("\n"))
    return "\n".join(
        line for line in stripped if line and not line.startswith("#")
    )


def load_blocks(block_data=None):
    """
    Load and parse the block data and return a function that provides
    block search based on a character code.

    ``block_data`` is the raw text of a ``Blocks.txt`` file (lines of the
    form ``START..STOP; Name`` with hexadecimal bounds). When it is None
    the file is downloaded with ``get_blocks()``.

    The returned function maps an integer code to the name of the block
    containing it, or to None when no block contains the code.

    Raises ValueError if a non-comment line has bounds that are not valid
    hexadecimal numbers.
    """
    if block_data is None:
        block_data = get_blocks()

    intervals = []
    for line in clean(block_data).split("\n"):
        if not line:
            # clean() yields a single empty line for empty input
            continue
        span, _, name = line.partition(";")
        start, _, stop = span.partition("..")
        intervals.append((int(start, 16), int(stop, 16), name.strip()))

    # The lookup below relies on the intervals being ordered by start code
    intervals.sort()
    starts = [start for start, _, _ in intervals]

    def locate_block(code, left=0, right=len(intervals)):
        """
        Binary search on an ordered list of intervals.

        Only the intervals with index in ``[left, right)`` are considered.
        Returns the block name, or None if ``code`` is in none of them.
        """
        # Rightmost interval whose start is <= code, if there is one
        pos = bisect.bisect_right(starts, code, left, right) - 1
        if pos < left:
            return None
        _, stop, name = intervals[pos]
        return name if code <= stop else None

    return locate_block


def _format_rows(character_data, get_block):
    """Turn cleaned ``UnicodeData.txt`` text into tab-separated rows.

    ``get_block`` maps an integer code to a block name or None. Lines that
    are too short or whose code is not hexadecimal are skipped.
    """
    rows = []
    for line in character_data.split("\n"):
        # Parse the needed data from the character's line
        attributes = line.strip().split(";")
        if len(attributes) <= 10:
            if line.strip():
                logging.warning("Skipping malformed line: %s", line)
            continue
        code = attributes[0]
        name = attributes[1]
        comment = attributes[10]

        # Convert character code to unicode
        try:
            num = int(code, 16)
        except ValueError:
            logging.warning("Could not convert %s", code)
            continue

        # Find the character's block
        blk = get_block(num)
        if blk is None:
            logging.warning(
                "Code %s not found in any block, char: %s", num, unichr(num)
            )
            # Keep the row, with an empty block field
            blk = ""
        rows.append("\t".join((name, comment, code, blk)))
    return rows


def main(out: str = "unicode_list.txt"):
    """Create the file with Unicode characters.

    Read the character and block data and unite them to a text file
    containing the following fields, separated by tab characters:
    `<name> <comment> <code> <block>`

    One character is written per line; ``<code>`` is the hexadecimal code
    exactly as it appears in the downloaded data, and ``<block>`` is empty
    when the code belongs to no known block. The file is written to the
    path ``out`` using UTF-8.
    """
    get_block = load_blocks()
    characters = clean(get_data())

    logging.info("Parsing character data...")
    output = _format_rows(characters, get_block)

    with open(out, "w", encoding="utf-8") as target:
        target.write("\n".join(output))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "path",
        type=str,
        # Without nargs="?" a positional is mandatory and the default is
        # never used
        nargs="?",
        help="the output path where to save the Unicode list.",
        default="unicode_list.txt",
    )
    args = parser.parse_args()
    main(args.path)