diff options
Diffstat (limited to '.local/bin/srt-deduplicate')
-rwxr-xr-x | .local/bin/srt-deduplicate | 96 |
1 files changed, 96 insertions, 0 deletions
#!/usr/local/bin/python3

"""Deduplicate repeated subtitles."""

import datetime
import srt_tools.utils
import logging
import operator

log = logging.getLogger(__name__)

try:  # Python 2
    range = xrange  # pytype: disable=name-error
except NameError:
    pass


def parse_args():
    """Build the command-line parser and return the parsed arguments.

    The parser comes from ``srt_tools.utils.basic_parser``; this script adds
    the ``-t``/``--ms`` option, whose value is converted to a
    ``datetime.timedelta`` and defaults to 5000 milliseconds.
    """
    examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples=examples,
    )
    parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=datetime.timedelta(milliseconds=5000),
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help="how many milliseconds distance a subtitle start time must be "
        "within of another to be considered a duplicate "
        "(default: 5000ms)",
    )

    return parser.parse_args()


def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content, mutating ``orig_subs`` in place.

    Subtitles are ordered by ``(content, start)`` and each one is compared
    with its neighbour in that ordering. The later of the two is dropped when
    both have the same content and the later one starts no more than
    ``acceptable_diff`` after the earlier one.

    :param orig_subs: Mutable list of subtitle objects; each must expose
                      ``content``, ``start`` and ``index`` attributes.
    :param acceptable_diff: Maximum distance between start times for two
                            subtitles to count as duplicates. A falsy value
                            (for example ``timedelta(0)``) disables the time
                            check, so any repeated content is removed.
    :returns: None; ``orig_subs`` is modified in place.
    """
    indices_to_remove = []

    # If we only store the subtitle itself and compare that, it's possible that
    # we'll not only remove the duplicate, but also the _original_ subtitle if
    # they have the same sub index/times/etc.
    #
    # As such, we need to also store the index in the original subs list that
    # this entry belongs to for each subtitle prior to sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
        cur_idx, cur_sub = subs[0]
        next_idx, next_sub = subs[1]

        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.append(next_idx)

    # The indices were collected while walking the content-sorted view, so
    # they are not in ascending order. Deleting from the highest index down
    # keeps the positions of the remaining targets valid; a running offset
    # would delete the wrong entries here.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]


def main():
    """Read the input subtitles, drop duplicates and write the result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)

    srt_tools.utils.set_basic_args(args)

    subs = list(args.input)
    deduplicate_subs(subs, args.ms)

    output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict)

    try:
        args.output.write(output)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(output.encode(args.encoding))


if __name__ == "__main__":  # pragma: no cover
    main()