carrot/tinygrad_repo/examples/self_tokenize.py

import os, pathlib, argparse
from examples.llama3 import Tokenizer
from tabulate import tabulate
from tinygrad import fetch
from tinygrad.helpers import flatten

# llama 3 tokenizer
tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())

def read_code(base_path):
  """Collect the Python sources of the tinygrad package located under base_path.

  Walks ``<base_path>/tinygrad`` recursively and reads every ``.py`` file,
  skipping every directory whose path contains ``tinygrad/runtime/autogen``.

  Args:
    base_path: directory that contains the ``tinygrad`` package directory.

  Returns:
    A list of ``(header, code)`` tuples. ``header`` is ``"### "`` followed by
    the file's path relative to the package directory, always written with
    ``/`` separators; ``code`` is the file content decoded as UTF-8.
    Directories are visited in sorted order and files are sorted within each
    directory, so the result is reproducible. A missing package directory
    yields an empty list.
  """
  root = os.path.join(base_path, "tinygrad")
  ret = []
  for path, dirs, files in os.walk(root):
    # sorting in place makes os.walk descend in a reproducible order
    dirs.sort()
    # sub-directories of a skipped directory contain the same substring, so they are skipped too
    if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue
    for name in sorted(files):
      if not name.endswith(".py"): continue
      fullpath = os.path.join(path, name)
      # relative to the walked root: independent of the OS separator and of
      # any "tinygrad/" component that base_path itself may contain
      rel = os.path.relpath(fullpath, root).replace(os.sep, '/')
      # explicit encoding: do not depend on the locale's default codec
      code = pathlib.Path(fullpath).read_text(encoding="utf-8")
      ret.append(("### " + rel, code))
  return ret

def write_code_to_file(filename, code_list):
  """Write the combined code to a file, one string per line group.

  Args:
    filename: path of the file to create or overwrite.
    code_list: iterable of string sequences (e.g. ``(header, code)`` tuples);
      all strings are written in order, joined by a newline.
  """
  # one level of flattening: every string of every entry, in order
  pieces = [piece for entry in code_list for piece in entry]
  # explicit UTF-8 so non-ASCII source text does not fail on the locale codec
  with open(filename, 'w', encoding="utf-8") as f:
    f.write('\n'.join(pieces))

if __name__ == "__main__":
  # command line: the only option is an optional destination for the combined dump
  cli = argparse.ArgumentParser(description="Analyze and optionally save tinygrad code.")
  cli.add_argument("--output", help="Output file to write the combined code to.")
  args = cli.parse_args()

  # (header, code) pairs for the package in the current directory
  ret = read_code(".")

  # per-file token counts, largest first; each file is tokenized as header + NUL + code
  rows = [[header, len(tokenizer.encode(header+"\x00"+source))] for header,source in ret]
  rows.sort(key=lambda row: -row[1])
  print(tabulate([["name", "llm tokens"], *rows], headers="firstrow"))

  # whole-codebase statistics over the NUL-joined headers and sources
  code_str = '\x00'.join(flatten(ret))
  print(f"code has {len(code_str)} chars")
  newline_count = code_str.count('\n')
  print(f"code has {newline_count} newlines")

  encoded = tokenizer.encode(code_str)
  print(f"code has {len(encoded)} tokens")

  # the dump is written only when --output was given
  if args.output:
    write_code_to_file(args.output, ret)
    print(f"Combined code written to {args.output}")
update 250418 2025-04-18 20:38:55 +09:00			`import os, pathlib, argparse`
openpilot v0.9.9 release date: 2025-03-08T09:09:29 master commit: ce355250be726f9bc8f0ac165a6cde41586a983d 2025-03-08 09:09:31 +00:00			`from examples.llama3 import Tokenizer`
			`from tabulate import tabulate`
			`from tinygrad import fetch`
			`from tinygrad.helpers import flatten`

			`# llama 3 tokenizer`
			`tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())`

			`def read_code(base_path):`
			`ret = []`
			`for path, _, files in os.walk(os.path.join(base_path, "tinygrad")):`
			`for name in files:`
			`if not name.endswith(".py"): continue`
			`if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue`
			`fullpath = os.path.join(path, name)`
			`code = pathlib.Path(fullpath).read_text()`
update 250418 2025-04-18 20:38:55 +09:00			`ret.append(("### " + fullpath.split("tinygrad/", 1)[1], code))`
openpilot v0.9.9 release date: 2025-03-08T09:09:29 master commit: ce355250be726f9bc8f0ac165a6cde41586a983d 2025-03-08 09:09:31 +00:00			`return ret`

update 250418 2025-04-18 20:38:55 +09:00			`def write_code_to_file(filename, code_list):`
			`"""Writes the combined code to a specified file."""`
			`with open(filename, 'w') as f:`
			`f.write('\n'.join(flatten(code_list)))`

openpilot v0.9.9 release date: 2025-03-08T09:09:29 master commit: ce355250be726f9bc8f0ac165a6cde41586a983d 2025-03-08 09:09:31 +00:00			`if __name__ == "__main__":`
update 250418 2025-04-18 20:38:55 +09:00			`parser = argparse.ArgumentParser(description="Analyze and optionally save tinygrad code.")`
			`parser.add_argument("--output", help="Output file to write the combined code to.")`
			`args = parser.parse_args()`

openpilot v0.9.9 release date: 2025-03-08T09:09:29 master commit: ce355250be726f9bc8f0ac165a6cde41586a983d 2025-03-08 09:09:31 +00:00			`ret = read_code(".")`

			`table = []`
			`for name,code in ret:`
			`table.append([name, len(tokenizer.encode(name+"\x00"+code))])`
			`print(tabulate([["name", "llm tokens"]]+sorted(table, key=lambda x: -x[1]), headers="firstrow"))`

			`code_str = '\x00'.join(flatten(ret))`
			`print(f"code has {len(code_str)} chars")`
			`newline_count = code_str.count('\n')`
			`print(f"code has {newline_count} newlines")`

			`encoded = tokenizer.encode(code_str)`
			`print(f"code has {len(encoded)} tokens")`
update 250418 2025-04-18 20:38:55 +09:00
			`if args.output:`
			`write_code_to_file(args.output, ret)`
			`print(f"Combined code written to {args.output}")`