Refactored the data file generation process
This commit is contained in:
parent
f607e06d2e
commit
3768d39ab3
|
@ -2,19 +2,56 @@
|
|||
Download the latest unicode tables from https://www.unicode.org and create a .txt file
|
||||
containing all the names, blocks and character codes
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
from urllib import request
|
||||
|
||||
curr_path = os.path.dirname(__file__)
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
|
||||
def main():
|
||||
def get_blocks(url="https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"):
    """Download the Unicode block definition file and return it as text.

    Args:
        url: Location of the ``Blocks.txt`` file. Defaults to the latest
            version published on unicode.org. Any URL that
            ``urllib.request.urlopen`` can open is accepted.

    Returns:
        The UTF-8 decoded content of the file as a single string.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
    """
    logging.info("Downloading block data...")
    # The context manager closes the connection even when reading or
    # decoding fails, instead of leaving it to the garbage collector.
    with request.urlopen(url) as response:
        content = response.read().decode("utf-8")
    logging.info("Done")
    return content
|
||||
|
||||
|
||||
def get_data(url="https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"):
    """Download the Unicode character data file and return it as text.

    Args:
        url: Location of the ``UnicodeData.txt`` file. Defaults to the latest
            version published on unicode.org. Any URL that
            ``urllib.request.urlopen`` can open is accepted.

    Returns:
        The UTF-8 decoded content of the file as a single string.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
    """
    # This function fetches the per-character table, not the block list,
    # so the progress message says so.
    logging.info("Downloading character data...")
    # The context manager closes the connection even when reading or
    # decoding fails, instead of leaving it to the garbage collector.
    with request.urlopen(url) as response:
        content = response.read().decode("utf-8")
    logging.info("Done")
    return content
|
||||
|
||||
|
||||
def clean(text):
    """Remove all blank or commented lines from a string.

    A line counts as a comment when its first non-whitespace character is
    ``#``. Surviving lines are returned with surrounding whitespace removed.

    Args:
        text: Multi-line string to filter.

    Returns:
        The remaining lines joined with ``"\\n"``, or an empty string when
        nothing is left.
    """
    # Strip before testing for "#" so indented comment lines are dropped too.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(
        line for line in stripped_lines if line and not line.startswith("#")
    )
|
||||
|
||||
|
||||
def load_blocks():
|
||||
""" Load and parse the block data and return a function that provides block
|
||||
search based on a character code.
|
||||
"""
|
||||
indices = []
|
||||
blocks = []
|
||||
with open("Blocks.txt", "r") as block_file:
|
||||
for line in block_file.readlines():
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
l, name = line.split(";")
|
||||
start, stop = l.split("..")
|
||||
indices.append((int(start, 16), int(stop, 16)))
|
||||
blocks.append(name.strip())
|
||||
block_data = clean(get_blocks())
|
||||
for line in block_data.split("\n"):
|
||||
l, name = line.split(";")
|
||||
start, stop = l.split("..")
|
||||
indices.append((int(start, 16), int(stop, 16)))
|
||||
blocks.append(name.strip())
|
||||
|
||||
def locate_block(code):
|
||||
for index, [start, stop] in enumerate(indices):
|
||||
|
@ -22,28 +59,42 @@ def main():
|
|||
continue
|
||||
else:
|
||||
if code >= start:
|
||||
return index
|
||||
return blocks[index]
|
||||
|
||||
return locate_block
|
||||
|
||||
|
||||
def main():
    """Build ``unicode_list.txt`` from the latest Unicode data files.

    Downloads the block and character tables, then writes one line per
    character to ``unicode_list.txt`` in the current working directory.
    Each line holds four tab-separated fields: name, comment (field 10 of
    the character record), hexadecimal code and block name. The block name
    is left empty when the code falls in no known block.
    """
    get_block = load_blocks()
    characters = clean(get_data())

    logging.info("Parsing character data...")

    output = []
    for line in characters.split("\n"):
        # Parse the needed data
        attributes = line.strip().split(";")
        if len(attributes) < 11:
            # Short or empty records would raise IndexError below.
            logging.warning("Skipping malformed line: %r", line)
            continue
        code = attributes[0]
        name = attributes[1]
        comment = attributes[10]

        # Convert character code to unicode
        try:
            num = int(code, 16)
        except ValueError:
            logging.warning("Could not convert " + code)
            continue

        # Find the character's block
        blk = get_block(num)
        if blk is not None:
            output.append("\t".join((name, comment, code, blk)))
        else:
            # chr() is the Python 3 spelling of Python 2's unichr().
            logging.warning(
                "Code %s not found in any block, char: %s", num, chr(num)
            )
            output.append(name + "\t" + comment + "\t" + code + "\t")

    # An explicit encoding keeps the output independent of the platform locale.
    with open("unicode_list.txt", "w", encoding="utf-8") as target:
        target.write("\n".join(output))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Reference in a new issue