# carrot/tinygrad_repo/examples/self_tokenize.py

import os, pathlib
from examples.llama3 import Tokenizer
from tabulate import tabulate
from tinygrad import fetch
from tinygrad.helpers import flatten

# Llama 3 tokenizer, built from the tokenizer.model file fetched from Hugging Face.
# Tokenizer is handed the fetched path in its POSIX string form.
_tokenizer_model = fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model")
tokenizer = Tokenizer(_tokenizer_model.as_posix())

def read_code(base_path):
  """Collect the Python sources found under ``<base_path>/tinygrad``.

  Walks the ``tinygrad`` directory below ``base_path`` and reads every file whose
  name ends in ``.py``, skipping any directory whose path contains
  ``tinygrad/runtime/autogen``.

  Args:
    base_path: directory that contains the ``tinygrad`` directory to scan.

  Returns:
    A list of ``(name, code)`` tuples. ``name`` is the file's path relative to the
    scanned ``tinygrad`` directory, always ``/``-separated; ``code`` is the file's
    text decoded as UTF-8. Entries appear in ``os.walk`` order.
  """
  root = os.path.join(base_path, "tinygrad")
  ret = []
  for path, _, files in os.walk(root):
    # the check depends only on the directory, so do it once per directory;
    # separators are normalized so the filter also matches backslash paths
    if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue
    for name in files:
      if not name.endswith(".py"): continue
      fullpath = os.path.join(path, name)
      code = pathlib.Path(fullpath).read_text(encoding="utf-8")
      # relpath against the walked root (instead of splitting on the first "tinygrad/")
      # stays correct when base_path itself contains "tinygrad/" and when os.sep is "\\"
      relname = os.path.relpath(fullpath, root).replace(os.sep, "/")
      ret.append((relname, code))
  return ret

if __name__ == "__main__":
  # (name, code) pairs for the tinygrad directory under the current working directory
  sources = read_code(".")

  # one row per file: its name and the token count of name + NUL + code, biggest first
  rows = [[fname, len(tokenizer.encode(fname+"\x00"+src))] for fname, src in sources]
  rows.sort(key=lambda row: row[1], reverse=True)
  print(tabulate([["name", "llm tokens"], *rows], headers="firstrow"))

  # whole-corpus statistics: every name and every source joined by NUL separators
  code_str = '\x00'.join(flatten(sources))
  newline_count = code_str.count('\n')
  print(f"code has {len(code_str)} chars")
  print(f"code has {newline_count} newlines")

  # token count of the joined corpus encoded in a single pass
  encoded = tokenizer.encode(code_str)
  print(f"code has {len(encoded)} tokens")