summary refs log tree commit diff stats
path: root/.local/bin/srt-deduplicate
diff options
context:
space:
mode:
Diffstat (limited to '.local/bin/srt-deduplicate')
-rwxr-xr-x .local/bin/srt-deduplicate 96
1 files changed, 96 insertions, 0 deletions
diff --git a/.local/bin/srt-deduplicate b/.local/bin/srt-deduplicate
new file mode 100755
index 0000000..8fe8f63
--- /dev/null
+++ b/.local/bin/srt-deduplicate
@@ -0,0 +1,96 @@
+#!/usr/local/bin/python3
+
+"""Deduplicate repeated subtitles."""
+
+import datetime
+import srt_tools.utils
+import logging
+import operator
+
# Module-level logger used for this tool's diagnostics.
log = logging.getLogger(__name__)

# Python 2 compatibility: alias ``range`` to ``xrange`` when that builtin
# exists. On Python 3 the bare lookup raises NameError and the builtin
# ``range`` is left untouched.
try:
    xrange  # pytype: disable=name-error
except NameError:
    pass
else:
    range = xrange  # pytype: disable=name-error
+
+
def parse_args():
    """Define this tool's command-line interface and parse the arguments.

    On top of whatever options ``srt_tools.utils.basic_parser`` supplies,
    this registers ``-t``/``--ms``: the duplicate-detection window. The value
    given on the command line is read as an integer number of milliseconds
    and converted to a ``datetime.timedelta``; when the option is omitted the
    window is 5000 milliseconds.

    Returns:
        The result of the parser's ``parse_args()`` call.
    """
    usage_examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }
    default_window = datetime.timedelta(milliseconds=5000)
    window_help = (
        "how many milliseconds distance a subtitle start time must be "
        "within of another to be considered a duplicate "
        "(default: 5000ms)"
    )

    parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )
    parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=default_window,
        # Convert the raw option string into a timedelta at parse time.
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help=window_help,
    )
    return parser.parse_args()
+
+
def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content, modifying the list in place.

    Subtitles are ordered by ``(content, start)`` and each one is compared
    with its immediate predecessor in that ordering. A subtitle is dropped
    when its content equals the predecessor's and either ``acceptable_diff``
    is falsy (for example a zero ``timedelta``), or its start time is no more
    than ``acceptable_diff`` after the predecessor's start time.

    Args:
        orig_subs: Mutable list of subtitle objects exposing ``content``,
            ``start`` and ``index`` attributes. Duplicates are deleted from
            this list; the surviving entries keep their relative order.
        acceptable_diff: Maximum distance between start times for two equal
            subtitles to count as duplicates. Must be addable to ``start``.
            A falsy value disables the time check.

    Returns:
        None. The result is the mutated ``orig_subs``.
    """
    # Same logger object as the module-level one: getLogger() returns the
    # same instance for the same name.
    logger = logging.getLogger(__name__)

    # If we only stored the subtitle itself and compared that, we could remove
    # not only the duplicate but also the _original_ subtitle when both share
    # the same sub index/times/etc. So remember each subtitle's position in
    # the original list before sorting.
    indexed_subs = sorted(
        enumerate(orig_subs),
        key=lambda entry: (entry[1].content, entry[1].start),
    )

    indices_to_remove = []
    adjacent_pairs = zip(indexed_subs, indexed_subs[1:])
    for (cur_idx, cur_sub), (next_idx, next_sub) in adjacent_pairs:
        if cur_sub.content != next_sub.content:
            continue
        if acceptable_diff and cur_sub.start + acceptable_diff < next_sub.start:
            # Same text, but too far apart in time to be a duplicate.
            continue

        logger.debug(
            "Marking l%d/s%d for removal, duplicate of l%d/s%d",
            next_idx,
            next_sub.index,
            cur_idx,
            cur_sub.index,
        )
        indices_to_remove.append(next_idx)

    # The indices were collected in content order, not list order. Deleting
    # from the highest index down keeps every remaining index valid, so the
    # entries that were actually marked are the ones removed.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]
+
+
def main():
    """Run the tool: read subtitles, drop duplicates, write the result.

    Parses the command line, configures logging at the requested level,
    deduplicates the parsed subtitles using the ``--ms`` window and writes
    the composed output to ``args.output``.
    """
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    # Materialise the input as a list so deduplicate_subs can delete entries
    # from it in place.
    subtitles = list(args.input)
    deduplicate_subs(subtitles, args.ms)

    composed = srt_tools.utils.compose_suggest_on_fail(
        subtitles, strict=args.strict
    )

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        # The stream rejected the text as-is; retry with it encoded in the
        # configured encoding.
        args.output.write(composed.encode(args.encoding))


if __name__ == "__main__":  # pragma: no cover
    main()