Refactored the data file generation process
This commit is contained in:
		
							parent
							
								
									f607e06d2e
								
							
						
					
					
						commit
						3768d39ab3
					
				
					 1 changed files with 80 additions and 29 deletions
				
			
		| 
						 | 
					@ -2,15 +2,52 @@
 | 
				
			||||||
Download the latest unicode tables from  https://www.unicode.org and create a .txt file
 | 
					Download the latest unicode tables from  https://www.unicode.org and create a .txt file
 | 
				
			||||||
containing all the names, blocks and character codes
 | 
					containing all the names, blocks and character codes
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import logging
 | 
				
			||||||
 | 
					from urllib import request
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					curr_path = os.path.dirname(__file__)
 | 
				
			||||||
 | 
					logging.basicConfig(level=logging.DEBUG)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main():
 | 
					def get_blocks():
 | 
				
			||||||
 | 
					    """ Download the info file for Unicode blocks.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    logging.info("Downloading block data...")
 | 
				
			||||||
 | 
					    req = request.urlopen("https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt")
 | 
				
			||||||
 | 
					    content = req.read().decode()
 | 
				
			||||||
 | 
					    logging.info("Done")
 | 
				
			||||||
 | 
					    return content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_data():
 | 
				
			||||||
 | 
					    """ Download the Unicode character data file (UnicodeData.txt).
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    logging.info("Downloading block data...")
 | 
				
			||||||
 | 
					    req = request.urlopen(
 | 
				
			||||||
 | 
					        "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    content = req.read().decode()
 | 
				
			||||||
 | 
					    logging.info("Done")
 | 
				
			||||||
 | 
					    return content
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def clean(text):
 | 
				
			||||||
 | 
					    """ Remove all blank or commented lines from a string
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    lines = text.strip().split("\n")
 | 
				
			||||||
 | 
					    clean_lines = [line.strip() for line in lines if line.strip() and line[0] != "#"]
 | 
				
			||||||
 | 
					    return "\n".join(clean_lines)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def load_blocks():
 | 
				
			||||||
 | 
					    """ Load and parse the block data and return a function that provides block
 | 
				
			||||||
 | 
					    search based on a character code.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
    indices = []
 | 
					    indices = []
 | 
				
			||||||
    blocks = []
 | 
					    blocks = []
 | 
				
			||||||
    with open("Blocks.txt", "r") as block_file:
 | 
					    block_data = clean(get_blocks())
 | 
				
			||||||
        for line in block_file.readlines():
 | 
					    for line in block_data.split("\n"):
 | 
				
			||||||
            if line.startswith("#"):
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
        l, name = line.split(";")
 | 
					        l, name = line.split(";")
 | 
				
			||||||
        start, stop = l.split("..")
 | 
					        start, stop = l.split("..")
 | 
				
			||||||
        indices.append((int(start, 16), int(stop, 16)))
 | 
					        indices.append((int(start, 16), int(stop, 16)))
 | 
				
			||||||
| 
						 | 
					@ -22,28 +59,42 @@ def main():
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                if code >= start:
 | 
					                if code >= start:
 | 
				
			||||||
                    return index
 | 
					                    return blocks[index]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with open("unicode_list.txt", "w") as target:
 | 
					    return locate_block
 | 
				
			||||||
        with open("UnicodeData.txt", "r") as names:
 | 
					
 | 
				
			||||||
            for line in names.readlines():
 | 
					
 | 
				
			||||||
 | 
					def main():
 | 
				
			||||||
 | 
					    get_block = load_blocks()
 | 
				
			||||||
 | 
					    characters = clean(get_data())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    logging.info("Parsing character data...")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    output = []
 | 
				
			||||||
 | 
					    for line in characters.split("\n"):
 | 
				
			||||||
 | 
					        # Parse the needed data
 | 
				
			||||||
        attributes = line.strip().split(";")
 | 
					        attributes = line.strip().split(";")
 | 
				
			||||||
        code = attributes[0]
 | 
					        code = attributes[0]
 | 
				
			||||||
        name = attributes[1]
 | 
					        name = attributes[1]
 | 
				
			||||||
        comment = attributes[10]
 | 
					        comment = attributes[10]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Convert character code to unicode
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            num = int(code, 16)
 | 
					            num = int(code, 16)
 | 
				
			||||||
        except ValueError:
 | 
					        except ValueError:
 | 
				
			||||||
                    print("could not convert " + code)
 | 
					            logging.warn("Could not convert " + code)
 | 
				
			||||||
            continue
 | 
					            continue
 | 
				
			||||||
                index = locate_block(num)
 | 
					
 | 
				
			||||||
                if index is not None:
 | 
					        # Find the character's block
 | 
				
			||||||
                    target.write(name + "\t" + comment + "\t" + code + "\t" + blocks[index] + "\n")
 | 
					        blk = get_block(num)
 | 
				
			||||||
 | 
					        if blk is not None:
 | 
				
			||||||
 | 
					            output.append("\t".join((name, comment, code, blk)))
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
                    print(
 | 
					            logging.warn("Code %s not found in any block, char: %s", num, unichr(num))
 | 
				
			||||||
                        "Code " + str(num) + " not found in block, char: " + unichr(num)
 | 
					            output.append(name + "\t" + comment + "\t" + code + "\t")
 | 
				
			||||||
                    )
 | 
					
 | 
				
			||||||
                    target.write(name + "\t" + comment + "\t" + code + "\t" + "\n")
 | 
					    with open("unicode_list.txt", "w") as target:
 | 
				
			||||||
 | 
					        target.write("\n".join(output))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Reference in a new issue