""" Download the latest unicode tables from https://www.unicode.org and create a .txt file containing all the names, blocks and character codes """ import os import logging from urllib import request curr_path = os.path.dirname(__file__) logging.basicConfig(level=logging.DEBUG) def get_blocks(): """ Download the info file for Unicode blocks. """ logging.info("Downloading block data...") req = request.urlopen("https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt") content = req.read().decode() logging.info("Done") return content def get_data(): """ Download the info file for Unicode blocks. """ logging.info("Downloading character data...") req = request.urlopen( "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" ) content = req.read().decode() logging.info("Done") return content def clean(text): """ Remove all blank or commented lies from a string """ lines = text.strip().split("\n") clean_lines = [line.strip() for line in lines if line.strip() and line[0] != "#"] return "\n".join(clean_lines) def load_blocks(): """ Load and parse the block data and return a function that provides block search based on a character code. """ indices = [] blocks = [] block_data = clean(get_blocks()) for line in block_data.split("\n"): l, name = line.split(";") start, stop = l.split("..") indices.append((int(start, 16), int(stop, 16))) blocks.append(name.strip()) def locate_block(code, left=0, right=len(indices)): """ Binary search on an ordered list of intervals. """ half = left + (right - left) // 2 [start, end] = indices[half] if start > code: return locate_block(code, left, right=half) elif end < code: return locate_block(code, half, right=right) else: return blocks[half] return locate_block def main(): """ Read the character and block data and unite them to a text file containing the following fields: ` ` seperated by tab characters. """ get_block = load_blocks() characters = clean(get_data()) logging.info("Parsing character data...") output = [] for line in characters.split("\n"): # Parse the needed data from the character's line attributes = line.strip().split(";") code = attributes[0] name = attributes[1] comment = attributes[10] # Convert character code to unicode try: num = int(code, 16) except ValueError: logging.warn("Could not convert " + code) continue # Find the character's block blk = get_block(num) if blk is not None: output.append("\t".join((name, comment, code, blk))) else: logging.warn("Code %s not found in any block, char: %s", num, unichr(num)) output.append(name + "\t" + comment + "\t" + code + "\t") with open("unicode_list.txt", "w") as target: target.write("\n".join(output)) if __name__ == "__main__": main()