--- a/buses.py Thu Jun 22 19:01:31 2017 +0300 +++ b/buses.py Sat Jun 24 19:38:05 2017 +0300 @@ -154,294 +154,314 @@ services = {} bus_stops = {} all_clusters = set() - -print('Ladataan routes... ', file = stderr, end = '', flush = True) -with open('gtfs/routes.txt') as file: - for row in read_csv(file): - route = BusRoute(row) - routes[route.reference] = route - routes_per_id[route.id] = route -print('%d linjaa' % len(routes), file = stderr) - -print('Ladataan ajovuorot... ', file = stderr, end = '', flush = True) - -shape_distances = {} -with open('gtfs/shapes.txt') as file: - for row in read_csv(file): - shape_distances[row['shape_id']] = max(shape_distances.get(row['shape_id'], 0), float(row['shape_dist_traveled'])) - -with open('gtfs/trips.txt') as file: - for row in read_csv(file): - if row['service_id'] not in services: - services[row['service_id']] = BusService(row['service_id']) - route = routes_per_id[row['route_id']] - trip = BusTrip( - reference = row['trip_id'], - route = route, - service = services[row['service_id']], - length = shape_distances[row['shape_id']] - ) - route.trips.add(trip) - assert trip.name not in all_trips - all_trips[trip.name] = trip -print('%d ajoa' % len(all_trips), file = stderr) - -def read_date(teksti): - return date(int(teksti[:4]), int(teksti[4:6]), int(teksti[6:])) - -def read_time(teksti): - tunti, minuutti, sekunti = map(int, teksti.split(':')) - return timedelta(hours = tunti, minutes = minuutti, seconds = sekunti) - -print('Ladataan päiväykset... ', file = stderr, flush = True) - -viimeinen_käyttöpäivä = date.today() +viimeinen_käyttöpäivä = None +clusters_by_name = {} services_for_day = {} -def date_range(start_date, end_date, *, include_end = False): - ''' Generates date from start_date to end_date. If include_end is True, then end_date will be yielded. ''' - current_date = start_date - while current_date < end_date: - yield current_date - current_date += timedelta(1) - if include_end: - yield end_date +def load_buses(gtfs_zip_path, profile): + global viimeinen_käyttöpäivä + from zipfile import ZipFile + with ZipFile(gtfs_zip_path) as gtfs_zip: + print('Ladataan linjat... ', file = stderr, end = '', flush = True) + with gtfs_zip.open('routes.txt') as file: + for row in read_csv(map(bytes.decode, file)): + route = BusRoute(row) + routes[route.reference] = route + routes_per_id[route.id] = route + print('%d linjaa' % len(routes), file = stderr) + + print('Ladataan ajovuorot... ', file = stderr, end = '', flush = True) + + shape_distances = {} + with gtfs_zip.open('shapes.txt') as file: + for row in read_csv(map(bytes.decode, file)): + shape_distances[row['shape_id']] = max(shape_distances.get(row['shape_id'], 0), float(row['shape_dist_traveled'])) -def add_day_to_service(service_name, day): - try: - service = services[service_name] - except KeyError: - return - else: - service.dates.add(day) - if day not in services_for_day: - services_for_day[day] = set() - services_for_day[day].add(service) - global viimeinen_käyttöpäivä - viimeinen_käyttöpäivä = max(day, viimeinen_käyttöpäivä) + with gtfs_zip.open('trips.txt') as file: + for row in read_csv(map(bytes.decode, file)): + if row['service_id'] not in services: + services[row['service_id']] = BusService(row['service_id']) + route = routes_per_id[row['route_id']] + trip = BusTrip( + reference = row['trip_id'], + route = route, + service = services[row['service_id']], + length = shape_distances[row['shape_id']] * float(profile['metrics']['shape-modifier']) + ) + route.trips.add(trip) + assert trip.name not in all_trips + all_trips[trip.name] = trip + print('%d ajoa' % len(all_trips), file = stderr) -def filter_day(row, day): - day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'] - return int(row[day_names[day.isoweekday() - 1]]) + def read_date(teksti): + return date(int(teksti[:4]), int(teksti[4:6]), int(teksti[6:])) + + def read_time(teksti): + tunti, minuutti, sekunti = map(int, teksti.split(':')) + return timedelta(hours = tunti, minutes = minuutti, seconds = sekunti) + + print('Ladataan päiväykset... ', file = stderr, flush = True) + + viimeinen_käyttöpäivä = date.today() -with open('gtfs/calendar.txt') as file: - for row in read_csv(file): - for day in date_range(read_date(row['start_date']), read_date(row['end_date']), include_end = True): - if filter_day(row, day): - add_day_to_service(service_name = row['service_id'], day = day) + def date_range(start_date, end_date, *, include_end = False): + ''' Generates date from start_date to end_date. If include_end is True, then end_date will be yielded. ''' + current_date = start_date + while current_date < end_date: + yield current_date + current_date += timedelta(1) + if include_end: + yield end_date -with open('gtfs/calendar_dates.txt') as file: - for row in read_csv(file): - add_day_to_service(service_name = row['service_id'], day = read_date(row['date'])) + def add_day_to_service(service_name, day): + try: + service = services[service_name] + except KeyError: + return + else: + service.dates.add(day) + if day not in services_for_day: + services_for_day[day] = set() + services_for_day[day].add(service) + global viimeinen_käyttöpäivä + viimeinen_käyttöpäivä = max(day, viimeinen_käyttöpäivä) -def services_available_at(day): - for service in services.values(): - if day in service.dates: - yield service + def filter_day(row, day): + day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'] + return int(row[day_names[day.isoweekday() - 1]]) -print('Ladataan pysäkit... ', file = stderr, end = '', flush = True) -with open('gtfs/stops.txt') as file: - for row in read_csv(file): - location = Sijainti(float(row['stop_lat']), float(row['stop_lon'])) - stop = BusStop( - reference = row['stop_id'], - name = row['stop_name'], - location = location, - code = row['stop_code'], - ) - bus_stops[stop.reference] = stop -with open('regions-per-stop.json') as file: - for stop_reference, region in json.load(file).items(): - bus_stops[stop_reference].region = region -print('%d pysäkkiä' % len(bus_stops), file = stderr) + with gtfs_zip.open('calendar.txt') as file: + for row in read_csv(map(bytes.decode, file)): + for day in date_range(read_date(row['start_date']), read_date(row['end_date']), include_end = True): + if filter_day(row, day): + add_day_to_service(service_name = row['service_id'], day = day) + + with gtfs_zip.open('calendar_dates.txt') as file: + for row in read_csv(map(bytes.decode, file)): + add_day_to_service(service_name = row['service_id'], day = read_date(row['date'])) + + def services_available_at(day): + for service in services.values(): + if day in service.dates: + yield service + + print('Ladataan pysäkit... ', file = stderr, end = '', flush = True) + with gtfs_zip.open('stops.txt') as file: + for row in read_csv(map(bytes.decode, file)): + location = Sijainti(float(row['stop_lat']), float(row['stop_lon'])) + stop = BusStop( + reference = row['stop_id'], + name = row['stop_name'], + location = location, + code = row['stop_code'], + ) + bus_stops[stop.reference] = stop + with open('regions-per-stop.json') as file: + for stop_reference, region in json.load(file).items(): + bus_stops[stop_reference].region = region + print('%d pysäkkiä' % len(bus_stops), file = stderr) -class BusStopCluster: - def __init__(self): - self.stops = set() - self._center = None - self.name = None - @property - def url_name(self): - return self.name.lower().replace('(', '').replace(')', '').replace(' ', '-') - def add_stop(self, stop): - assert not stop.cluster - stop.cluster = self - self.stops.add(stop) - self._center = None - @property - def center(self): - if not self._center: - if self.stops: - from statistics import median - pointtype = type(next(iter(self.stops)).location) - self._center = pointtype( - median(stop.location.x for stop in self.stops), - median(stop.location.y for stop in self.stops), - ) - else: - raise ValueError('an empty cluster has no center point') - return self._center - def merge(self, other): - for bus_stop in other.stops: - bus_stop.cluster = self - self.stops |= other.stops - other.stops = set() - other._center = None - def schedule(self, max_amount = 50): - result = [] - for stop in self.stops: - result += stop.schedule(max_amount) - result.sort(key = lambda schedule_entry: schedule_entry['time']) - return result[:max_amount] + class BusStopCluster: + def __init__(self): + self.stops = set() + self._center = None + self.name = None + @property + def url_name(self): + return self.name.lower().replace('(', '').replace(')', '').replace(' ', '-') + def add_stop(self, stop): + assert not stop.cluster + stop.cluster = self + self.stops.add(stop) + self._center = None + @property + def center(self): + if not self._center: + if self.stops: + from statistics import median + pointtype = type(next(iter(self.stops)).location) + self._center = pointtype( + median(stop.location.x for stop in self.stops), + median(stop.location.y for stop in self.stops), + ) + else: + raise ValueError('an empty cluster has no center point') + return self._center + def merge(self, other): + for bus_stop in other.stops: + bus_stop.cluster = self + self.stops |= other.stops + other.stops = set() + other._center = None + def schedule(self, max_amount = 50): + result = [] + for stop in self.stops: + result += stop.schedule(max_amount) + result.sort(key = lambda schedule_entry: schedule_entry['time']) + return result[:max_amount] -from collections import defaultdict -bus_stops_by_name = defaultdict(set) -for bus_stop in bus_stops.values(): - bus_stops_by_name[bus_stop.name].add(bus_stop) -bus_stops_by_name = dict(bus_stops_by_name) + from collections import defaultdict + bus_stops_by_name = defaultdict(set) + for bus_stop in bus_stops.values(): + bus_stops_by_name[bus_stop.name].add(bus_stop) + bus_stops_by_name = dict(bus_stops_by_name) -# ryhmittele bus_stops nimen mukaan -all_clusters = [] -def cluster_bus_stops(): - sorted_bus_stops = sorted(bus_stops.values(), key = lambda bus_stop: bus_stop.name) - for bus_stop in sorted_bus_stops: - if not bus_stop.cluster: - stops_to_cluster = {bus_stop} - # etsi pysäkin samannimiset vastaparit - for pair_candidate in bus_stops_by_name[bus_stop.name]: - distance = pair_candidate.location.etäisyys(bus_stop.location) - if pair_candidate is not bus_stop and distance <= 0.4: - stops_to_cluster.add(pair_candidate) - for stop_to_cluster in stops_to_cluster: - if stop_to_cluster.cluster: - cluster = stop_to_cluster.cluster - break - else: - cluster = BusStopCluster() - all_clusters.append(cluster) - for stop_to_cluster in stops_to_cluster: - if not stop_to_cluster.cluster: - cluster.add_stop(stop_to_cluster) - # Merkitse muistiin pysäkkien vastaparit käyttäen hyväksi tämänhetkistä ryhmittelytietoa - for bus_stop in bus_stops.values(): - if bus_stop.cluster: - bus_stop.pairs = bus_stop.cluster.stops - {bus_stop} - # Ryhmitä ne bus_stops, joilla ei ollut omaa vastaparia, muiden pysäkkien kanssa - for bus_stop in sorted_bus_stops: - if len(bus_stop.cluster.stops) == 1: - possibilities = set() - for cluster in all_clusters: - if cluster is not bus_stop.cluster: - distance = cluster.center.etäisyys(bus_stop.location) - if distance <= 0.4: - possibilities.add((distance, cluster)) - if possibilities: - best = min(possibilities)[1] - all_clusters.remove(bus_stop.cluster) - best.merge(bus_stop.cluster) + # ryhmittele bus_stops nimen mukaan + all_clusters = [] + def cluster_bus_stops(): + sorted_bus_stops = sorted(bus_stops.values(), key = lambda bus_stop: bus_stop.name) + for bus_stop in sorted_bus_stops: + if not bus_stop.cluster: + stops_to_cluster = {bus_stop} + # etsi pysäkin samannimiset vastaparit + for pair_candidate in bus_stops_by_name[bus_stop.name]: + distance = pair_candidate.location.etäisyys(bus_stop.location) + if pair_candidate is not bus_stop and distance <= 0.4: + stops_to_cluster.add(pair_candidate) + for stop_to_cluster in stops_to_cluster: + if stop_to_cluster.cluster: + cluster = stop_to_cluster.cluster + break + else: + cluster = BusStopCluster() + all_clusters.append(cluster) + for stop_to_cluster in stops_to_cluster: + if not stop_to_cluster.cluster: + cluster.add_stop(stop_to_cluster) + # Merkitse muistiin pysäkkien vastaparit käyttäen hyväksi tämänhetkistä ryhmittelytietoa + for bus_stop in bus_stops.values(): + if bus_stop.cluster: + bus_stop.pairs = bus_stop.cluster.stops - {bus_stop} + # Ryhmitä ne bus_stops, joilla ei ollut omaa vastaparia, muiden pysäkkien kanssa + for bus_stop in sorted_bus_stops: + if len(bus_stop.cluster.stops) == 1: + possibilities = set() + for cluster in all_clusters: + if cluster is not bus_stop.cluster: + distance = cluster.center.etäisyys(bus_stop.location) + if distance <= 0.4: + possibilities.add((distance, cluster)) + if possibilities: + best = min(possibilities)[1] + all_clusters.remove(bus_stop.cluster) + best.merge(bus_stop.cluster) -def shared_elements_in_n_sets(sets): - from itertools import combinations - result = set() - for pair in combinations(sets, 2): - result |= pair[0] & pair[1] - return result + def shared_elements_in_n_sets(sets): + from itertools import combinations + result = set() + for pair in combinations(sets, 2): + result |= pair[0] & pair[1] + return result -def name_clusters(): - from collections import defaultdict - from pprint import pprint - clusters_per_name = defaultdict(set) - for cluster in all_clusters: - name_representing_stop = min((len(stop.reference), stop.reference, stop) for stop in cluster.stops)[2] - clusters_per_name[name_representing_stop.name].add(cluster) - for name, clusters in clusters_per_name.items(): - if len(clusters) == 1: - # Ryhmä on ainoa jolla on varaus tälle nimelle. Sen kuin vaan. - next(iter(clusters)).name = name - else: - # Olisiko kaikki klusterit eri alueilla? - common_regions = shared_elements_in_n_sets({stop.region for stop in cluster.stops} for cluster in clusters) - # Esitys: ryhmä -> ne alueet jotka ovat tälle ryhmälle ainutlaatuisia - proposal = { - cluster: {stop.region for stop in cluster.stops} - common_regions - {None} - for cluster in clusters - } - # Jos enintään yksi klusteri tässä esityksessä on kokonaan ilman omaa aluetta, jolla se voisi eritellä, - # niin nimetään klusterit näiden alueiden mukaan. - # Se klusteri jolla ei ole omaa aluetta (jos on) jätetään ilman aluepäätettä. - if sum([1 for unique_areas in proposal.values() if not unique_areas]) <= 1: - for cluster, unique_areas in proposal.items(): - individual_cluster_name = name - if unique_areas: - individual_cluster_name += ' (' + min(unique_areas) + ')' - cluster.name = individual_cluster_name + def name_clusters(): + from collections import defaultdict + from pprint import pprint + clusters_per_name = defaultdict(set) + for cluster in all_clusters: + name_representing_stop = min((len(stop.reference), stop.reference, stop) for stop in cluster.stops)[2] + clusters_per_name[name_representing_stop.name].add(cluster) + for name, clusters in clusters_per_name.items(): + if len(clusters) == 1: + # Ryhmä on ainoa jolla on varaus tälle nimelle. Sen kuin vaan. + next(iter(clusters)).name = name + else: + # Olisiko kaikki klusterit eri alueilla? + common_regions = shared_elements_in_n_sets({stop.region for stop in cluster.stops} for cluster in clusters) + # Esitys: ryhmä -> ne alueet jotka ovat tälle ryhmälle ainutlaatuisia + proposal = { + cluster: {stop.region for stop in cluster.stops} - common_regions - {None} + for cluster in clusters + } + # Jos enintään yksi klusteri tässä esityksessä on kokonaan ilman omaa aluetta, jolla se voisi eritellä, + # niin nimetään klusterit näiden alueiden mukaan. + # Se klusteri jolla ei ole omaa aluetta (jos on) jätetään ilman aluepäätettä. + if sum([1 for unique_areas in proposal.values() if not unique_areas]) <= 1: + for cluster, unique_areas in proposal.items(): + individual_cluster_name = name + if unique_areas: + individual_cluster_name += ' (' + min(unique_areas) + ')' + cluster.name = individual_cluster_name + else: + # Typerä reunatapaus. Indeksoidaan numeroin... + for n, (_, cluster) in enumerate(sorted( + min((stop.reference.lower(), cluster) for stop in cluster.stops) + for cluster in clusters + ), 1): + individual_cluster_name = name + '-' + str(n) + cluster.name = individual_cluster_name + + print('Ryhmitellään pysäkit...') + cluster_bus_stops() + name_clusters() + + for cluster in all_clusters: + if cluster.url_name in clusters_by_name: + print('Warning: Clusters %r and %r share the same URL name: %r' % (cluster.name, clusters_by_name[cluster.url_name].name, cluster.url_name)) else: - # Typerä reunatapaus. Indeksoidaan numeroin... - for n, (_, cluster) in enumerate(sorted( - min((stop.reference.lower(), cluster) for stop in cluster.stops) - for cluster in clusters - ), 1): - individual_cluster_name = name + '-' + str(n) - cluster.name = individual_cluster_name - -print('Ryhmitellään pysäkit...') -cluster_bus_stops() -name_clusters() - -clusters_by_name = {} -for cluster in all_clusters: - if cluster.url_name in clusters_by_name: - print('Warning: Clusters %r and %r share the same URL name: %r' % (cluster.name, clusters_by_name[cluster.url_name].name, cluster.url_name)) - else: - clusters_by_name[cluster.url_name] = cluster + clusters_by_name[cluster.url_name] = cluster -print('Ladataan aikataulut... ', end = '', flush = True, file = stderr) -with open('gtfs/stop_times.txt') as file: - row_count = sum(line.count('\n') for line in file) - progress = 0 - file.seek(0) - for row in read_csv(file): - trip = all_trips[transform_trip_reference(row['trip_id'])] - arrival_time = read_time(row['arrival_time']) - departure_time = read_time(row['departure_time']) - stop = bus_stops[row['stop_id']] - traveled_distance = float(row['shape_dist_traveled']) - trip.schedule.append(BusHalt(arrival_time, departure_time, stop, trip, traveled_distance)) - stop.involved_trips.add(trip) - progress += 1 - if progress % 1000 == 0: - print('\rLadataan aikataulut... %.1f%%' % (progress * 100 / row_count), end = ' ', file = stderr) -print('\rLadataan aikataulut... ladattu', file = stderr) + print('Ladataan aikataulut... ', end = '', flush = True, file = stderr) + with gtfs_zip.open('stop_times.txt') as file: + row_count = sum(line.count(b'\n') for line in file) + with gtfs_zip.open('stop_times.txt') as file: + progress = 0 + for row in read_csv(map(bytes.decode, file)): + trip = all_trips[transform_trip_reference(row['trip_id'])] + arrival_time = read_time(row['arrival_time']) + departure_time = read_time(row['departure_time']) + stop = bus_stops[row['stop_id']] + traveled_distance = float(row['shape_dist_traveled']) * float(profile['metrics']['shape-modifier']) + trip.schedule.append(BusHalt(arrival_time, departure_time, stop, trip, traveled_distance)) + stop.involved_trips.add(trip) + progress += 1 + if progress % 1000 == 0: + print('\rLadataan aikataulut... %.1f%%' % (progress * 100 / row_count), end = ' ', file = stderr) + print('\rLadataan aikataulut... ladattu', file = stderr) + + for trip in all_trips.values(): + from busroute import simplify_name + schedule = trip.concise_schedule() + try: + trip.from_place = simplify_name(schedule[0]) + trip.to_place = simplify_name(schedule[-1]) + except IndexError: + trip.from_place = '' + trip.to_place = '' -for trip in all_trips.values(): - from busroute import simplify_name - schedule = trip.concise_schedule() - try: - trip.from_place = simplify_name(schedule[0]) - trip.to_place = simplify_name(schedule[-1]) - except IndexError: - trip.from_place = '' - trip.to_place = '' + for route in routes.values(): + from collections import Counter + from busroute import simplify_name + tally = Counter() + for trip in route.trips: + schedule = trip.concise_schedule() + places = set(schedule) + do_add = True + assert type(schedule) is list + for candidate in tally: + if places.issubset(set(candidate)): + do_add = False + tally.update({tuple(candidate)}) + if do_add: + tally.update({tuple(schedule)}) + try: + most_common_route = tally.most_common(1)[0][0] + route.description = simplify_name(most_common_route[0]) + ' - ' + simplify_name(most_common_route[-1]) + except: + route.description = '' + route.trips = sorted(route.trips, key = lambda trip: trip.schedule[0].departure_time) -for route in routes.values(): - from collections import Counter - from busroute import simplify_name - tally = Counter() - for trip in route.trips: - schedule = trip.concise_schedule() - places = set(schedule) - do_add = True - assert type(schedule) is list - for candidate in tally: - if places.issubset(set(candidate)): - do_add = False - tally.update({tuple(candidate)}) - if do_add: - tally.update({tuple(schedule)}) - try: - most_common_route = tally.most_common(1)[0][0] - route.description = simplify_name(most_common_route[0]) + ' - ' + simplify_name(most_common_route[-1]) - except: - route.description = '' - route.trips = sorted(route.trips, key = lambda trip: trip.schedule[0].departure_time) + # Fölin datassa on jotain tosi kummaa. Ilmeisesti ajovuoron viimeisen pysähdyksen saapumisaika on ihan täysin + # väärin. Arvaan että se on seuraavan lähdön aika, mutta joka tapauksessa se on väärin. + # Arvataan mikä se todellinen saapumisaika on. Se ei voi mennä kauhean paljon pahemmin vikaan kuin alkuperäinen + # väärin oleva data. + for trip in all_trips.values(): + bus_speed_coefficient = 750 # metriä minuutissa + last_leg_distance = trip.schedule[-1].traveled_distance - trip.schedule[-2].traveled_distance + trip.schedule[-1].arrival_time = trip.schedule[-2].departure_time + timedelta(minutes = last_leg_distance / bus_speed_coefficient) + +if __name__ == '__main__': + from configparser import ConfigParser + profile = ConfigParser() + profile.read('profiles/föli.ini') + load_buses('gtfs.zip', profile)