Changed the character block search algorithm to binary search
This commit is contained in:
parent
ae19a664f6
commit
64f644162b
|
@ -23,11 +23,11 @@ def get_blocks():
|
||||||
def get_data():
|
def get_data():
|
||||||
""" Download the info file for Unicode blocks.
|
""" Download the info file for Unicode blocks.
|
||||||
"""
|
"""
|
||||||
|
logging.info("Downloading character data...")
|
||||||
req = request.urlopen(
|
req = request.urlopen(
|
||||||
"https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
|
"https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
|
||||||
)
|
)
|
||||||
content = req.read().decode()
|
content = req.read().decode()
|
||||||
logging.info("Downloading character data...")
|
|
||||||
logging.info("Done")
|
logging.info("Done")
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
@ -53,26 +53,34 @@ def load_blocks():
|
||||||
indices.append((int(start, 16), int(stop, 16)))
|
indices.append((int(start, 16), int(stop, 16)))
|
||||||
blocks.append(name.strip())
|
blocks.append(name.strip())
|
||||||
|
|
||||||
def locate_block(code):
|
def locate_block(code, left=0, right=len(indices)):
|
||||||
for index, [start, stop] in enumerate(indices):
|
"""
|
||||||
if code > stop:
|
Binary search on an ordered list of intervals.
|
||||||
continue
|
"""
|
||||||
else:
|
half = left + (right - left) // 2
|
||||||
if code >= start:
|
[start, end] = indices[half]
|
||||||
return blocks[index]
|
if start > code:
|
||||||
|
return locate_block(code, left, right=half)
|
||||||
|
elif end < code:
|
||||||
|
return locate_block(code, half, right=right)
|
||||||
|
else:
|
||||||
|
return blocks[half]
|
||||||
|
|
||||||
return locate_block
|
return locate_block
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
""" Read the character and block data and unite them to a text file containing the following fields:
|
||||||
|
`<character name> <character comment> <code> <block name>`
|
||||||
|
separated by tab characters.
|
||||||
|
"""
|
||||||
get_block = load_blocks()
|
get_block = load_blocks()
|
||||||
characters = clean(get_data())
|
characters = clean(get_data())
|
||||||
|
|
||||||
logging.info("Parsing character data...")
|
logging.info("Parsing character data...")
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
for line in characters.split("\n"):
|
for line in characters.split("\n"):
|
||||||
# Parse the needed data
|
# Parse the needed data from the character's line
|
||||||
attributes = line.strip().split(";")
|
attributes = line.strip().split(";")
|
||||||
code = attributes[0]
|
code = attributes[0]
|
||||||
name = attributes[1]
|
name = attributes[1]
|
||||||
|
|
Loading…
Reference in a new issue