# rec_vit_parseq.yml
# Text-recognition config (model_type: rec) for the ParseQ algorithm with a
# ViTParseQ backbone. Sections: runtime settings (Global), Optimizer,
# Architecture, Loss / PostProcess / Metric, and the Train / Eval pipelines.
---
Global:
  use_gpu: true
  epoch_num: 20
  log_smooth_window: 20
  print_batch_step: 5
  save_model_dir: ./output/rec/parseq
  save_epoch_step: 3
  # [start_iteration, interval]: evaluation is run every 500 iterations
  # after the 0th iteration
  eval_batch_step: [0, 500]
  cal_metric_during_train: true
  # Left unset in this config (null).
  pretrained_model: null
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: doc/imgs_words_en/word_10.png
  # for data or label process
  character_dict_path: ppocr/utils/dict/parseq_dict.txt
  character_type: en
  max_text_length: 25
  num_heads: 8
  infer_mode: false
  use_space_char: false
  save_res_path: ./output/rec/predicts_parseq.txt

Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: OneCycle
    max_lr: 0.0007

Architecture:
  model_type: rec
  algorithm: ParseQ
  in_channels: 3
  # No transform module is configured.
  Transform: null
  Backbone:
    name: ViTParseQ
    # Same height/width as the [3, 32, 128] image_shape used by the
    # SVTRRecResizeImg transforms in Train and Eval.
    img_size: [32, 128]
    patch_size: [4, 8]
    embed_dim: 384
    depth: 12
    num_heads: 6
    mlp_ratio: 4
    in_channels: 3
  Head:
    name: ParseQHead
    # Architecture
    # NOTE(review): same value as Global.max_text_length; presumably the two
    # must be kept in sync -- confirm.
    max_text_length: 25
    embed_dim: 384
    dec_num_heads: 12
    dec_mlp_ratio: 4
    dec_depth: 1
    # Training
    perm_num: 6
    perm_forward: true
    perm_mirrored: true
    dropout: 0.1
    # Decoding mode (test)
    decode_ar: true
    refine_iters: 1

Loss:
  name: ParseQLoss

PostProcess:
  name: ParseQLabelDecode

Metric:
  name: RecMetric
  main_indicator: acc
  is_filter: true

Train:
  dataset:
    name: LMDBDataSet
    # Left unset in this config; presumably must point at the training data
    # before running -- confirm.
    data_dir: null
    transforms:
      - DecodeImage:  # load image
          img_mode: BGR
          channel_first: false
      - ParseQRecAug:
          aug_type: 0  # or 1
      - ParseQLabelEncode: null
      - SVTRRecResizeImg:
          image_shape: [3, 32, 128]
          padding: false
      - KeepKeys:
          # dataloader will return list in this order
          keep_keys: ['image', 'label', 'length']
  loader:
    shuffle: true
    batch_size_per_card: 192
    drop_last: true
    num_workers: 4

Eval:
  dataset:
    name: LMDBDataSet
    # Left unset in this config; presumably must point at the evaluation data
    # before running -- confirm.
    data_dir: null
    transforms:
      - DecodeImage:  # load image
          img_mode: BGR
          channel_first: false
      - ParseQLabelEncode: null  # Class handling label
      - SVTRRecResizeImg:
          image_shape: [3, 32, 128]
          padding: false
      - KeepKeys:
          keep_keys: ['image', 'label', 'length']
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 384
    num_workers: 4
  108. num_workers: 4