From 64f644162b001ab2c491ce816abf437459720ef3 Mon Sep 17 00:00:00 2001 From: Vangelis Kostalas Date: Fri, 17 May 2019 14:02:46 +0300 Subject: [PATCH] Changed the character block search algo to binary search --- generate_character_list.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/generate_character_list.py b/generate_character_list.py index b256d7f..efbe13f 100644 --- a/generate_character_list.py +++ b/generate_character_list.py @@ -23,11 +23,11 @@ def get_blocks(): def get_data(): """ Download the info file for Unicode blocks. """ + logging.info("Downloading character data...") req = request.urlopen( "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" ) content = req.read().decode() - logging.info("Downloading character data...") logging.info("Done") return content @@ -53,26 +53,34 @@ def load_blocks(): indices.append((int(start, 16), int(stop, 16))) blocks.append(name.strip()) - def locate_block(code): - for index, [start, stop] in enumerate(indices): - if code > stop: - continue - else: - if code >= start: - return blocks[index] + def locate_block(code, left=0, right=len(indices)): + """ + Binary search on an ordered list of intervals. + """ + half = left + (right - left) // 2 + [start, end] = indices[half] + if start > code: + return locate_block(code, left, right=half) + elif end < code: + return locate_block(code, half, right=right) + else: + return blocks[half] return locate_block def main(): + """ Read the character and block data and unite them to a text file containing the following fields: + ` ` + separated by tab characters. + """ get_block = load_blocks() characters = clean(get_data()) logging.info("Parsing character data...") - output = [] for line in characters.split("\n"): - # Parse the needed data + # Parse the needed data from the character's line attributes = line.strip().split(";") code = attributes[0] name = attributes[1]