ingest2.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. """
  2. Goals:
  3. + Use OKLab
  4. + Improved clustering logic
  5. + Parallel, in the same way as anim-ingest
  6. + Async requests for downloads
  7. + Include more info about the pokemon (form, display name, icon sprite source)
  8. + Include megas/gmax/etc, tagged so the UI can filter them
  9. * Include more images (get more stills from pokemondb + serebii)
  10. * Include shinies
  11. * Fall back automatically (try showdown animated, then showdown gen5, then pokemondb)
  12. * Make the filtering system more explicit and easier to work around
  13. * Output a record of ingest for auditing
  14. * Automatically retry a partially failed ingest, using the ingest record
  15. """
  16. # async def load_image(session: ClientSession, url: str) -> Image.Image:
  17. # async with session.get(url) as res:
  18. # res.raise_for_status()
  19. # return Image.open(BytesIO(await res.read()))
  20. # async def load_all_images(urls: list[str]) -> tuple[list[Image.Image], list[Exception]]:
  21. # async with ClientSession() as session:
  22. # results = await asyncio.gather(
  23. # *(load_image(session, url) for url in urls),
  24. # return_exceptions=True
  25. # )
  26. # success = []
  27. # errors = []
  28. # for r in results:
  29. # (success if isinstance(r, Image.Image) else errors).append(r)
  30. # return success, errors
  31. # def get_urls(target: Pokemon, form: FormInfo) -> list[str]:
  32. # lower_name = form.name.lower()
  33. # return [
  34. # f"https://play.pokemonshowdown.com/sprites/ani/{lower_name}.gif",
  35. # f"https://play.pokemonshowdown.com/sprites/ani-back/{lower_name}.gif",
  36. # f"https://play.pokemonshowdown.com/sprites/gen5/{lower_name}.png",
  37. # f"https://play.pokemonshowdown.com/sprites/gen5-back/{lower_name}.png",
  38. # f"https://img.pokemondb.net/sprites/home/normal/{lower_name}.png",
  39. # # TODO: add other sources - but make sure we never cross-contaminate forms;
  40. # # if we pull the wrong form for something, it will be a nightmare to debug
  41. # # f"https://www.serebii.net/scarletviolet/pokemon/new/{target.num}-{???}.png"
  42. # # f"https://www.serebii.net/pokemon/art/{target.num}-{???}.png"
  43. # ]
  44. # async def set_data(target: Pokemon, seed=0) -> list[Exception]:
  45. # all_errors = []
  46. # for form in target.forms:
  47. # print(f" #{target.num} - Ingesting Form: {form.name}")
  48. # urls = get_urls(target, form)
  49. # print(f" #{target.num} - Attempting {len(urls)} potential sources")
  50. # images, errors = await load_all_images(urls)
  51. # all_errors.extend(errors)
  52. # print(f" #{target.num} - Loaded {len(images)} sources")
  53. # try:
  54. # pixels = np.concatenate([get_pixels(img) for img in images])
  55. # print(f" #{target.num} - Summarizing {len(pixels)} total pixels")
  56. # total = calc_statistics(pixels)
  57. # print(f" #{target.num} - Begin clustering")
  58. # clusters = find_clusters(pixels, seed=seed)
  59. # print(f" #{target.num} - End clustering, chose k={len(clusters)}")
  60. # form.data = Data(total=total, clusters=clusters)
  61. # except Exception as e:
  62. # all_errors.append(e)
  63. # return all_errors
  64. # async def ingest(pool_size: int, seed: int) -> tuple[list[str], list[str]]:
  65. # computed = []
  66. # errors = []
  67. # loop = asyncio.get_event_loop()
  68. # with ProcessPoolExecutor(pool_size) as exec:
  69. # print(f"Ingesting #{start} - #{end}")
  70. # for pkmn in pkdx[start - 1:end]:
  71. # print(f"Ingesting #{pkmn.num}: {pkmn.species}...")
  72. # new_errors = await set_data(pkmn, seed)
  73. # loop.run_in_executor(exec, set_data, pkmn, seed)
  74. # computed.append(loop.run_in_executor(pool, ingest(p)))
  75. # try:
  76. # errors.extend(new_errors)
  77. # print(f"Finished #{pkmn.num}: {len(new_errors)} error(s)")
  78. # return json.dumps(asdict(pkmn))
  79. # except Exception as e:
  80. # print(e)
  81. # errors.append(e)
  82. # if __name__ == "__main__":
  83. # from sys import argv
  84. # dex_file = argv[1] if len(argv) > 1 else "data/pokedex.json"
  85. # out_file = argv[2] if len(argv) > 2 else "data/database-latest.db"
  86. # dex_span = argv[3] if len(argv) > 3 else "1-151"
  87. # log_file = argv[4] if len(argv) > 4 else "errors-latest.log"
  88. # set_seed = argv[5] if len(argv) > 5 else "20230304"
  89. # start, end = map(int, dex_span.split("-", maxsplit=1))
  90. # seed = int(set_seed)
  91. # errors = []
  92. # pkdx = list(load_pokedex(dex_file))
  93. # loop = asyncio.new_event_loop()
  94. # with open(log_file, "w") as log:
  95. # # TODO better logging
  96. # log.writelines(str(e) for e in errors)
  97. # with open(out_file, "a") as db:
  98. # for _, line in computed:
  99. # db.write(line)
  100. # db.write("\n")