#!/usr/bin/env python2.7
"""
Copyleft 2013 Osama Khalid.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

This tool extracts HTTPSEverywhere rulesets and converts any given
URL to the secure version if it exists, or it returns None.

Here is a sample:
>>> import find_rules
>>> replacer = find_rules.FindRules("/path/to/default.rulesets")
>>> replacer.find("http://en.wikipedia.org/")
'https://en.wikipedia.org/'
>>> replacer.find("http://en.wikipedi.org/") # With a typo
>>>
"""

import re
import xml.etree.ElementTree as ET


class FindRules(object):
    """Look up the HTTPS rewrite of a URL in an HTTPSEverywhere rulesets file.

    After construction, ``self.dict`` maps each enabled ruleset name to a
    dictionary with three lists:

    * ``'targets'``    -- host patterns taken from ``<target host=...>``
    * ``'rules'``      -- ``(from, to)`` tuples taken from ``<rule>``
    * ``'exclusions'`` -- regex strings taken from ``<exclusion pattern=...>``
    """

    def __init__(self, filename):
        """Parse *filename* (anything ``ET.parse`` accepts) into ``self.dict``."""
        self.extract_rulesets(filename)

    def verify_target(self, target, host):
        """Return True if *host* is covered by the target pattern *target*.

        ``*.example.com`` covers ``example.com`` and every host ending in
        ``.example.com``; ``www.example.*`` covers ``www.example`` and every
        host starting with ``www.example.``; any other target must equal the
        host exactly.
        """
        # "*." and ".*" describe the same character set for str.strip(), so
        # a single call removes a wildcard from either end.
        bare = target.strip("*.")
        if target.startswith("*."):
            # Require a label boundary so that "*.example.com" does not
            # accept "badexample.com".
            return host == bare or host.endswith("." + bare)
        if target.endswith(".*"):
            return host == bare or host.startswith(bare + ".")
        return host == bare

    def convert_to_python(self, matching, replacement):
        r"""Translate a JavaScript rule into Python ``re`` syntax.

        JavaScript replacements refer to groups as ``$1``; Python needs
        ``\g<1>``.  Returns the tuple ``(new_matching, new_replacement)``.
        """
        # "(x)?" becomes "(x|)" so the group always participates in the
        # match; this avoids the "unmatched group" error from re.sub().
        new_matching = matching.replace(")?", "|)")
        new_replacement = re.sub(r"\$(\d)", r"\\g<\1>", replacement)
        return new_matching, new_replacement

    def extract_rulesets(self, filename):
        """Load every enabled ``<ruleset>`` of *filename* into ``self.dict``.

        Rulesets carrying a ``default_off`` attribute are skipped.  A ruleset
        without a ``name`` attribute, or a child element missing its
        ``host`` / ``from`` / ``to`` / ``pattern`` attribute, raises KeyError.
        """
        root = ET.parse(filename).getroot()
        self.dict = {}

        for child in root:
            if child.tag != "ruleset" or "default_off" in child.attrib:
                continue

            targets, rules, exclusions = [], [], []
            # Iterate the element itself: Element.getchildren() no longer
            # exists on Python 3.9+.
            for node in child:
                if node.tag == "target":
                    targets.append(node.attrib['host'])
                elif node.tag == "rule":
                    rules.append((node.attrib['from'], node.attrib['to']))
                elif node.tag == "exclusion":
                    exclusions.append(node.attrib['pattern'])

            self.dict[child.attrib['name']] = {
                'targets': targets,
                'rules': rules,
                'exclusions': exclusions,
            }

    def find(self, url):
        """Return the rewritten form of *url*, or None when no rule changes it.

        Raises IndexError when *url* contains no ``http://`` or ``https://``
        host part, and ``re.error`` when a rule cannot be applied.
        """
        host_match = re.search(r"https?://([^/]+)", url)
        if host_match is None:
            raise IndexError("no http(s) hostname found in URL: %r" % (url,))
        host = host_match.group(1)

        # In HTTPSEverywhere, URLs must contain a '/'.  Look after the host
        # so that "https://" URLs are handled as well as "http://" ones.
        if "/" not in url[host_match.end(1):]:
            url += "/"

        for ruleset in self.dict.values():
            if not any(self.verify_target(target, host)
                       for target in ruleset['targets']):
                continue
            # An exclusion only disables the ruleset it belongs to; other
            # rulesets may still rewrite the URL.
            if any(re.search(exclusion, url)
                   for exclusion in ruleset['exclusions']):
                continue

            for matching_regex, replacement_regex in ruleset['rules']:
                new_matching, new_replacement = self.convert_to_python(
                    matching_regex, replacement_regex)
                try:
                    replace_url = re.sub(new_matching, new_replacement, url)
                except re.error as err:
                    # Carry the offending rule in the message instead of
                    # printing it to stdout.
                    raise re.error("%s (from=%r, to=%r, url=%r)" % (
                        err, new_matching, new_replacement, url))
                if replace_url != url:
                    return replace_url

        return None


def _main(argv):
    """Command-line entry point: print the rewrite of argv[2] per argv[1]."""
    import sys

    if len(argv) < 3:
        sys.stderr.write("usage: %s RULESETS_FILE URL\n" % argv[0])
        return 2
    script = FindRules(argv[1])
    replaced_url = script.find(argv[2])
    print(replaced_url)
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(_main(sys.argv))