preprocess.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. import glob
  3. import os
  4. from modelscope.preprocessors.nlp.space.fields.intent_field import \
  5. IntentBPETextField
  6. FILE_NAME = 'train.json'
  7. def intent_preprocess(path, cfg):
  8. bpe = IntentBPETextField(path, cfg)
  9. args = cfg.Dataset
  10. build_examples_fn = bpe.build_examples_multi_turn if args.trigger_role == 'system' \
  11. else bpe.build_examples_single_turn
  12. build_score_matrix_fn = bpe.build_score_matrix
  13. build_score_matrix_multiprocessing_fn = bpe.build_score_matrix_multiprocessing
  14. data_paths = list(
  15. os.path.dirname(c) for c in sorted(
  16. glob.glob(args.data_dir + '/**/' + FILE_NAME, recursive=True)))
  17. data_paths = bpe.filter_data_path(data_paths=data_paths)
  18. for mode in ['train', 'valid', 'test']:
  19. for data_path in data_paths:
  20. input_file = os.path.join(data_path, f'{mode}.json')
  21. output_file = os.path.join(data_path,
  22. f'{mode}.{bpe.tokenizer_type}.jsonl')
  23. output_score_file = os.path.join(data_path, f'{mode}.Score.npy')
  24. if os.path.exists(input_file) and not os.path.exists(output_file):
  25. examples = build_examples_fn(input_file, data_type=mode)
  26. if examples:
  27. bpe.save_examples(examples, output_file)
  28. else:
  29. continue
  30. if os.path.exists(output_file) and not os.path.exists(output_score_file) and \
  31. not args.dynamic_score and 'AnPreDial' in data_path:
  32. examples = bpe.load_examples(output_file)
  33. if args.num_process >= 2:
  34. score_matrix = build_score_matrix_multiprocessing_fn(
  35. examples)
  36. else:
  37. score_matrix = build_score_matrix_fn(examples)
  38. bpe.save_examples(score_matrix, output_score_file)