wordmatcher.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. # -*- coding: utf-8 -*-
  2. # *****************************************************************************
  3. # Copyright (C) 2006-2020 Jorgen Stenarson. <jorgen.stenarson@bostream.nu>
  4. # Copyright (C) 2020 Bassem Girgis. <brgirgis@gmail.com>
  5. #
  6. # Distributed under the terms of the BSD License. The full license is in
  7. # the file COPYING, distributed as part of this software.
  8. # *****************************************************************************
  9. import re
  10. def str_find_all(in_str, ch):
  11. result = []
  12. index = 0
  13. while index >= 0:
  14. index = in_str.find(ch, index)
  15. if index >= 0:
  16. result.append(index)
  17. index += 1
  18. return result
  19. word_pattern = re.compile("(x*)")
  20. def markwords(in_str, is_wordfun):
  21. markers = {True: "x", False: "o"}
  22. return "".join([markers[is_wordfun(ch)] for ch in in_str])
  23. def split_words(in_str, is_wordfun):
  24. return [x for x in word_pattern.split(markwords(in_str, is_wordfun)) if x != ""]
  25. def mark_start_segment(in_str, is_segment):
  26. def mark_start(s):
  27. if s[0:1] == "x":
  28. return "s" + s[1:]
  29. else:
  30. return s
  31. return "".join(map(mark_start, split_words(in_str, is_segment)))
  32. def mark_end_segment(in_str, is_segment):
  33. def mark_start(s):
  34. if s[0:1] == "x":
  35. return s[:-1] + "s"
  36. else:
  37. return s
  38. return "".join(map(mark_start, split_words(in_str, is_segment)))
  39. def mark_start_segment_index(in_str, is_segment):
  40. return str_find_all(mark_start_segment(in_str, is_segment), "s")
  41. def mark_end_segment_index(in_str, is_segment):
  42. return [x + 1 for x in str_find_all(mark_end_segment(in_str, is_segment), "s")]
  43. # ############### Following are used in lineobj ###########################
  44. def is_word_token(in_str):
  45. return not is_non_word_token(in_str)
  46. def is_non_word_token(in_str):
  47. if len(in_str) != 1 or in_str in " \t\n":
  48. return True
  49. else:
  50. return False
  51. def next_start_segment(in_str, is_segment):
  52. in_str = "".join(in_str)
  53. result = []
  54. for start in mark_start_segment_index(in_str, is_segment):
  55. result[len(result) : start] = [start for x in range(start - len(result))]
  56. result[len(result) : len(in_str)] = [
  57. len(in_str) for x in range(len(in_str) - len(result) + 1)
  58. ]
  59. return result
  60. def next_end_segment(in_str, is_segment):
  61. in_str = "".join(in_str)
  62. result = []
  63. for start in mark_end_segment_index(in_str, is_segment):
  64. result[len(result) : start] = [start for x in range(start - len(result))]
  65. result[len(result) : len(in_str)] = [
  66. len(in_str) for x in range(len(in_str) - len(result) + 1)
  67. ]
  68. return result
  69. def prev_start_segment(in_str, is_segment):
  70. in_str = "".join(in_str)
  71. result = []
  72. prev = 0
  73. for start in mark_start_segment_index(in_str, is_segment):
  74. result[len(result) : start + 1] = [prev for x in range(start - len(result) + 1)]
  75. prev = start
  76. result[len(result) : len(in_str)] = [
  77. prev for x in range(len(in_str) - len(result) + 1)
  78. ]
  79. return result
  80. def prev_end_segment(in_str, is_segment):
  81. in_str = "".join(in_str)
  82. result = []
  83. prev = 0
  84. for start in mark_end_segment_index(in_str, is_segment):
  85. result[len(result) : start + 1] = [prev for x in range(start - len(result) + 1)]
  86. prev = start
  87. result[len(result) : len(in_str)] = [
  88. len(in_str) for x in range(len(in_str) - len(result) + 1)
  89. ]
  90. return result