Changed the character block search algo to binary search

This commit is contained in:
Vangelis Kostalas 2019-05-17 14:02:46 +03:00
parent ae19a664f6
commit 64f644162b

View file

@ -23,11 +23,11 @@ def get_blocks():
def get_data(): def get_data():
""" Download the info file for Unicode blocks. """ Download the info file for Unicode blocks.
""" """
logging.info("Downloading character data...")
req = request.urlopen( req = request.urlopen(
"https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
) )
content = req.read().decode() content = req.read().decode()
logging.info("Downloading character data...")
logging.info("Done") logging.info("Done")
return content return content
@ -53,26 +53,34 @@ def load_blocks():
indices.append((int(start, 16), int(stop, 16))) indices.append((int(start, 16), int(stop, 16)))
blocks.append(name.strip()) blocks.append(name.strip())
def locate_block(code): def locate_block(code, left=0, right=len(indices)):
for index, [start, stop] in enumerate(indices): """
if code > stop: Binary search on an ordered list of intervals.
continue """
else: half = left + (right - left) // 2
if code >= start: [start, end] = indices[half]
return blocks[index] if start > code:
return locate_block(code, left, right=half)
elif end < code:
return locate_block(code, half, right=right)
else:
return blocks[half]
return locate_block return locate_block
def main(): def main():
""" Read the character and block data and combine them into a text file containing the following fields:
`<character name> <character comment> <code> <block name>`
separated by tab characters.
"""
get_block = load_blocks() get_block = load_blocks()
characters = clean(get_data()) characters = clean(get_data())
logging.info("Parsing character data...") logging.info("Parsing character data...")
output = [] output = []
for line in characters.split("\n"): for line in characters.split("\n"):
# Parse the needed data # Parse the needed data from the character's line
attributes = line.strip().split(";") attributes = line.strip().split(";")
code = attributes[0] code = attributes[0]
name = attributes[1] name = attributes[1]