Wed, 28 Jun 2017 17:52:42 +0300
Ääh
#!/usr/bin/env python3
# Data model and loader for GTFS bus schedule data: routes, trips, services,
# stops, halts and clusters of nearby stops.
import enum
import json
from sys import stderr
from datetime import date, time, datetime, timedelta
from copy import copy
from misc import *
from geometry import *


def transform_trip_reference(reference):
    """Map a raw GTFS trip id to the key used in `all_trips` (currently the identity)."""
    return reference


class BusTrip:
    """One scheduled run of a bus along a route.

    `schedule` is the ordered list of BusHalt objects appended by `load_buses`.
    """

    def __init__(self, reference, route, service, length):
        self.reference, self.route, self.service = reference, route, service
        self.length = length
        self.schedule = []
        self.name = transform_trip_reference(reference)

    def __repr__(self):
        return 'all_trips[%r]' % self.name

    def contains_stop(self, stop):
        """Return the first halt of this trip made at `stop`, or None if there is none."""
        for halt in self.schedule:
            if halt.stop is stop:
                return halt
        return None

    def is_served_at(self, day):
        """Return True if this trip's service runs on `day` according to `services_for_day`."""
        return self.service in services_for_day.get(day, ())

    def concise_schedule(self, starting_stop=None):
        """Return the regions this trip passes through, in order, without repeats.

        If `starting_stop` is given and is an element of `self.schedule`, only
        the halts from it onwards are considered.
        NOTE(review): `self.schedule` holds BusHalt objects, so `starting_stop`
        presumably has to be a halt rather than a BusStop -- confirm with callers.
        """
        if starting_stop and starting_stop in self.schedule:
            schedule = self.schedule[self.schedule.index(starting_stop):]
        else:
            schedule = self.schedule
        used_areas = set()
        result = []
        for halt in schedule:
            region = halt.stop.region
            if region and region not in used_areas:
                used_areas.add(region)
                result.append(region)
        return result


class BusRoute:
    """A bus line, built from one row of routes.txt."""

    def __init__(self, entry):
        self.id = entry['route_id']
        self.reference = entry['route_short_name']
        self.trips = set()

    def __repr__(self):
        return 'routes[%r]' % self.reference


class BusService:
    """A GTFS service id together with the set of dates on which it runs."""

    def __init__(self, reference):
        self.reference = reference
        self.dates = set()

    def __repr__(self):
        return 'services[%r]' % self.reference


class BusStop:
    """A single bus stop."""

    def __init__(self, reference, name, location, code=None):
        self.reference, self.name, self.location = reference, name, location
        self.code = code or reference
        self.cluster = None
        # Stays None unless `load_buses` assigns a region from regions-per-stop.json;
        # without this default, stops missing from that file raised AttributeError.
        self.region = None
        self.pairs = set()  # nearby stops that share this stop's name
        self.involved_trips = set()

    def __repr__(self):
        return 'bus_stops[%r]' % self.reference

    def schedule(self, *, max_amount=50, arrivals=False):
        """Fetch the next `max_amount` departures from this stop.

        Buses that are merely arriving at their terminus are not counted
        unless `arrivals` is true.

        Returns a list of dicts, sorted by time within each day, with the keys
        - 'time': the arrival moment as a datetime,
        - 'trip': the BusTrip, and
        - 'stop': the corresponding BusHalt.

        If the calendar does not hold enough halts for this stop, the result
        is shorter than requested, possibly even empty.
        """
        result = []
        # Start from yesterday so that night services past midnight are included.
        day = today() - timedelta(days=1)
        # As long as there are not enough entries yet,
        while len(result) < max_amount:
            try:
                # fetch the schedule of the current day and add it,
                result += self.schedule_for_day(day, arrivals=arrivals)
            except ValueError:
                # unless we ran past the end of the calendar, in which case stop,
                break
            # and move on to the next day.
            day += timedelta(1)
        # Truncate the result to the requested amount.
        return result[:max_amount]

    def schedule_for_day(self, date, *, arrivals=False):
        """Fetch this stop's upcoming halts on the given day, sorted by time.

        Raises ValueError if `date` is past the last date of the schedule data.
        """
        # Running past the schedule calendar must be an error: if an empty
        # result were returned instead, `schedule` would keep searching forever.
        if date > viimeinen_käyttöpäivä:
            raise ValueError('tried to retrieve schedule for date %s which is outside schedule data' % date)
        result = []
        midnight = datetime.combine(date, time())
        current_time = now()
        # For every trip
        for trip in self.involved_trips:
            # that is driven on this day
            if not trip.is_served_at(date):
                continue
            # and halts at this stop, though not as the final halt of the trip
            # (nor as an arrival into the terminus region unless asked for),
            halt = trip.contains_stop(self)
            if halt and (arrivals or not halt.is_arrival) and halt is not trip.schedule[-1]:
                # and whose halt is still in the future,
                stop_time = midnight + halt.arrival_time
                if stop_time >= current_time:
                    # add the halt to the list.
                    result.append({
                        'time': stop_time,
                        'trip': trip,
                        'stop': halt,
                    })
        # Sort the result by arrival time.
        result.sort(key=lambda schedule_entry: schedule_entry['time'])
        return result


class BusHalt:
    """One stop event of a trip: a BusTrip halting at a BusStop.

    `arrival_time` and `departure_time` are timedelta offsets from midnight
    (see `read_time` in `load_buses`).
    """

    def __init__(self, arrival_time, departure_time, stop, trip, traveled_distance):
        self.arrival_time, self.departure_time = arrival_time, departure_time
        self.stop, self.trip = stop, trip
        self.traveled_distance = traveled_distance

    @property
    def is_arrival(self):
        """True if every later halt of the trip lies in the same region as this one.

        Always False when the stop has no region. The value is computed once
        and cached on the instance.
        """
        if not hasattr(self, 'cachedIsArrival'):
            region = self.stop.region
            if region:
                iterator = iter(self.trip.schedule)
                # Advance the iterator to just past this halt.
                halt = next(iterator)
                while halt is not self:
                    halt = next(iterator)
                self.cachedIsArrival = all(
                    later.stop.region == region for later in iterator
                )
            else:
                self.cachedIsArrival = False
        return self.cachedIsArrival

    def __repr__(self):
        return 'BusHalt(%r, %r, %r, %r)' % (self.arrival_time, self.departure_time, self.stop, self.trip)


routes = {}
routes_per_id = {}
all_trips = {}
services = {}
bus_stops = {}
all_clusters = set()
viimeinen_käyttöpäivä = None  # last date covered by the schedule data
clusters_by_name = {}
services_for_day = {}


def load_buses(gtfs_zip_path, profile):
    """Load the GTFS feed at `gtfs_zip_path` into the module-level registries.

    Fills `routes`, `routes_per_id`, `all_trips`, `services`, `bus_stops`,
    `all_clusters`, `clusters_by_name` and `services_for_day`, and sets
    `viimeinen_käyttöpäivä`. Also reads 'regions-per-stop.json' from the
    current directory. `profile` must provide
    profile['metrics']['shape-modifier'], a factor applied to every shape
    distance.
    """
    global viimeinen_käyttöpäivä
    global all_clusters
    from collections import Counter, defaultdict
    from zipfile import ZipFile

    # Converted once; the original re-parsed it for every row.
    shape_modifier = float(profile['metrics']['shape-modifier'])

    with ZipFile(gtfs_zip_path) as gtfs_zip:
        print('Ladataan linjat... ', file=stderr, end='', flush=True)
        with gtfs_zip.open('routes.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                route = BusRoute(row)
                routes[route.reference] = route
                routes_per_id[route.id] = route
        print('%d linjaa' % len(routes), file=stderr)

        print('Ladataan ajovuorot... ', file=stderr, end='', flush=True)
        # Length of each shape = the largest shape_dist_traveled seen for it.
        shape_distances = {}
        with gtfs_zip.open('shapes.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                shape_distances[row['shape_id']] = max(
                    shape_distances.get(row['shape_id'], 0),
                    float(row['shape_dist_traveled']),
                )

        with gtfs_zip.open('trips.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                if row['service_id'] not in services:
                    services[row['service_id']] = BusService(row['service_id'])
                route = routes_per_id[row['route_id']]
                trip = BusTrip(
                    reference=row['trip_id'],
                    route=route,
                    service=services[row['service_id']],
                    length=shape_distances[row['shape_id']] * shape_modifier,
                )
                route.trips.add(trip)
                assert trip.name not in all_trips
                all_trips[trip.name] = trip
        print('%d ajoa' % len(all_trips), file=stderr)

        def read_date(teksti):
            """Parse a GTFS date string of the form YYYYMMDD."""
            return date(int(teksti[:4]), int(teksti[4:6]), int(teksti[6:]))

        def read_time(teksti):
            """Parse an H:M:S string into a timedelta (hours may exceed 23)."""
            tunti, minuutti, sekunti = map(int, teksti.split(':'))
            return timedelta(hours=tunti, minutes=minuutti, seconds=sekunti)

        print('Ladataan päiväykset... ', file=stderr, flush=True)
        viimeinen_käyttöpäivä = date.today()

        def date_range(start_date, end_date, *, include_end=False):
            """Generate the dates from start_date up to end_date.

            end_date itself is yielded only if include_end is true. Nothing is
            yielded when start_date is after end_date.
            """
            current_date = start_date
            while current_date < end_date:
                yield current_date
                current_date += timedelta(1)
            # The original yielded end_date even for an inverted range.
            if include_end and start_date <= end_date:
                yield end_date

        def add_day_to_service(service_name, day):
            """Mark `day` as served by the named service; unknown services are ignored."""
            global viimeinen_käyttöpäivä
            try:
                service = services[service_name]
            except KeyError:
                return
            service.dates.add(day)
            if day not in services_for_day:
                services_for_day[day] = set()
            services_for_day[day].add(service)
            viimeinen_käyttöpäivä = max(day, viimeinen_käyttöpäivä)

        def remove_day_from_service(service_name, day):
            """Mark `day` as not served by the named service; unknown services are ignored."""
            try:
                service = services[service_name]
            except KeyError:
                return
            service.dates.discard(day)
            if day in services_for_day:
                services_for_day[day].discard(service)

        def filter_day(row, day):
            """Return the calendar.txt weekday flag (0 or 1) of `row` for the weekday of `day`."""
            day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
            return int(row[day_names[day.isoweekday() - 1]])

        with gtfs_zip.open('calendar.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                for day in date_range(read_date(row['start_date']), read_date(row['end_date']), include_end=True):
                    if filter_day(row, day):
                        add_day_to_service(service_name=row['service_id'], day=day)

        with gtfs_zip.open('calendar_dates.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                day = read_date(row['date'])
                # GTFS: exception_type 1 adds service on the date, 2 removes it.
                # The original treated every row as an addition.
                # Assumes read_csv yields dict-like rows -- TODO confirm.
                if row.get('exception_type', '1').strip() == '2':
                    remove_day_from_service(service_name=row['service_id'], day=day)
                else:
                    add_day_to_service(service_name=row['service_id'], day=day)

        def services_available_at(day):
            """Yield every service that runs on `day`."""
            for service in services.values():
                if day in service.dates:
                    yield service

        print('Ladataan pysäkit... ', file=stderr, end='', flush=True)
        with gtfs_zip.open('stops.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                location = Sijainti(float(row['stop_lat']), float(row['stop_lon']))
                stop = BusStop(
                    reference=row['stop_id'],
                    name=row['stop_name'],
                    location=location,
                    code=row['stop_code'],
                )
                bus_stops[stop.reference] = stop
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open('regions-per-stop.json', encoding='utf-8') as file:
            for stop_reference, region in json.load(file).items():
                bus_stops[stop_reference].region = region
        print('%d pysäkkiä' % len(bus_stops), file=stderr)

        class BusStopCluster:
            """A group of stops that are presented together under one name."""

            def __init__(self):
                self.stops = set()
                self._center = None  # cached centre point, see `center`
                self.name = None

            @property
            def url_name(self):
                """The cluster name lowercased, without parentheses, spaces turned into dashes."""
                return self.name.lower().replace('(', '').replace(')', '').replace(' ', '-')

            def add_stop(self, stop):
                """Add a stop that does not yet belong to any cluster."""
                assert not stop.cluster
                stop.cluster = self
                self.stops.add(stop)
                self._center = None

            @property
            def center(self):
                """Median point of the member stops; ValueError for an empty cluster."""
                if not self._center:
                    if not self.stops:
                        raise ValueError('an empty cluster has no center point')
                    from statistics import median
                    pointtype = type(next(iter(self.stops)).location)
                    self._center = pointtype(
                        median(stop.location.x for stop in self.stops),
                        median(stop.location.y for stop in self.stops),
                    )
                return self._center

            def merge(self, other):
                """Move every stop of `other` into this cluster, leaving `other` empty."""
                for bus_stop in other.stops:
                    bus_stop.cluster = self
                self.stops |= other.stops
                other.stops = set()
                other._center = None
                # Membership changed, so this cluster's cached centre is stale
                # too; the original only reset the centre of `other`.
                self._center = None

            def schedule(self, *, max_amount=50):
                """Return the next `max_amount` departures over all member stops, sorted by time."""
                result = []
                for stop in self.stops:
                    result += stop.schedule(max_amount=max_amount)
                result.sort(key=lambda schedule_entry: schedule_entry['time'])
                return result[:max_amount]

        # Group the stops by name.
        bus_stops_by_name = defaultdict(set)
        for bus_stop in bus_stops.values():
            bus_stops_by_name[bus_stop.name].add(bus_stop)
        bus_stops_by_name = dict(bus_stops_by_name)

        all_clusters = []

        def cluster_bus_stops():
            """Partition all stops into clusters and record each stop's pairs."""
            sorted_bus_stops = sorted(bus_stops.values(), key=lambda bus_stop: bus_stop.name)
            for bus_stop in sorted_bus_stops:
                if bus_stop.cluster:
                    continue
                stops_to_cluster = {bus_stop}
                # Find the same-named counterparts of this stop within 0.4
                # distance units (presumably kilometres -- depends on
                # `etäisyys`, TODO confirm).
                for pair_candidate in bus_stops_by_name[bus_stop.name]:
                    distance = pair_candidate.location.etäisyys(bus_stop.location)
                    if pair_candidate is not bus_stop and distance <= 0.4:
                        stops_to_cluster.add(pair_candidate)
                # Reuse a cluster one of them already belongs to, else make a new one.
                for stop_to_cluster in stops_to_cluster:
                    if stop_to_cluster.cluster:
                        cluster = stop_to_cluster.cluster
                        break
                else:
                    cluster = BusStopCluster()
                    all_clusters.append(cluster)
                for stop_to_cluster in stops_to_cluster:
                    if not stop_to_cluster.cluster:
                        cluster.add_stop(stop_to_cluster)
            # Record the counterparts of each stop using the grouping as it
            # stands now, before lone stops get merged into other clusters.
            for bus_stop in bus_stops.values():
                if bus_stop.cluster:
                    bus_stop.pairs = bus_stop.cluster.stops - {bus_stop}
            # Merge the stops that found no counterpart into the nearest other cluster.
            for bus_stop in sorted_bus_stops:
                if len(bus_stop.cluster.stops) != 1:
                    continue
                possibilities = []
                for cluster in all_clusters:
                    if cluster is not bus_stop.cluster:
                        distance = cluster.center.etäisyys(bus_stop.location)
                        if distance <= 0.4:
                            possibilities.append((distance, cluster))
                if possibilities:
                    # Compare on distance only: clusters define no ordering, so
                    # a distance tie raised TypeError with a plain min().
                    best = min(possibilities, key=lambda possibility: possibility[0])[1]
                    all_clusters.remove(bus_stop.cluster)
                    best.merge(bus_stop.cluster)

        def shared_elements_in_n_sets(sets):
            """Return the elements that occur in at least two of the given sets."""
            from itertools import combinations
            result = set()
            for pair in combinations(sets, 2):
                result |= pair[0] & pair[1]
            return result

        def name_clusters():
            """Give every cluster a name, disambiguating clusters that would share one."""
            clusters_per_name = defaultdict(set)
            for cluster in all_clusters:
                # The stop with the shortest (then smallest) reference names the cluster.
                name_representing_stop = min(
                    (len(stop.reference), stop.reference, stop) for stop in cluster.stops
                )[2]
                clusters_per_name[name_representing_stop.name].add(cluster)
            for name, clusters in clusters_per_name.items():
                if len(clusters) == 1:
                    # This cluster is the only one claiming the name, so just use it.
                    next(iter(clusters)).name = name
                    continue
                # Could all the clusters be told apart by region?
                common_regions = shared_elements_in_n_sets(
                    {stop.region for stop in cluster.stops} for cluster in clusters
                )
                # Proposal: cluster -> the regions that are unique to that cluster.
                proposal = {
                    cluster: {stop.region for stop in cluster.stops} - common_regions - {None}
                    for cluster in clusters
                }
                # If at most one cluster in the proposal is left without a
                # region of its own, name the clusters after these regions.
                # The cluster without its own region (if any) gets no suffix.
                if sum(1 for unique_areas in proposal.values() if not unique_areas) <= 1:
                    for cluster, unique_areas in proposal.items():
                        individual_cluster_name = name
                        if unique_areas:
                            individual_cluster_name += ' (' + min(unique_areas) + ')'
                        cluster.name = individual_cluster_name
                else:
                    # Silly corner case: fall back to numbering the clusters.
                    for n, (_, cluster) in enumerate(sorted(
                        min((stop.reference.lower(), cluster) for stop in cluster.stops)
                        for cluster in clusters
                    ), 1):
                        cluster.name = name + '-' + str(n)

        print('Ryhmitellään pysäkit...')
        cluster_bus_stops()
        name_clusters()

        for cluster in all_clusters:
            if cluster.url_name in clusters_by_name:
                print('Warning: Clusters %r and %r share the same URL name: %r' % (cluster.name, clusters_by_name[cluster.url_name].name, cluster.url_name))
            else:
                clusters_by_name[cluster.url_name] = cluster

        print('Ladataan aikataulut... ', end='', flush=True, file=stderr)
        # First pass: count the rows so that progress can be shown as a percentage.
        with gtfs_zip.open('stop_times.txt') as file:
            row_count = sum(line.count(b'\n') for line in file)
        with gtfs_zip.open('stop_times.txt') as file:
            progress = 0
            for row in read_csv(map(bytes.decode, file)):
                trip = all_trips[transform_trip_reference(row['trip_id'])]
                arrival_time = read_time(row['arrival_time'])
                departure_time = read_time(row['departure_time'])
                stop = bus_stops[row['stop_id']]
                traveled_distance = float(row['shape_dist_traveled']) * shape_modifier
                trip.schedule.append(BusHalt(arrival_time, departure_time, stop, trip, traveled_distance))
                stop.involved_trips.add(trip)
                progress += 1
                if progress % 1000 == 0:
                    print('\rLadataan aikataulut... %.1f%%' % (progress * 100 / row_count), end=' ', file=stderr)
        print('\rLadataan aikataulut... ladattu', file=stderr)

        from busroute import simplify_name

        # Origin and destination of each trip, taken from its region list.
        for trip in all_trips.values():
            schedule = trip.concise_schedule()
            if schedule:
                trip.from_place = simplify_name(schedule[0])
                trip.to_place = simplify_name(schedule[-1])
            else:
                trip.from_place = ''
                trip.to_place = ''

        # Describe each route by the most common region sequence of its trips.
        for route in routes.values():
            tally = Counter()
            for trip in route.trips:
                schedule = trip.concise_schedule()
                places = set(schedule)
                do_add = True
                assert type(schedule) is list
                # A trip whose regions are a subset of an already seen
                # sequence counts as a vote for that sequence instead.
                for candidate in tally:
                    if places.issubset(set(candidate)):
                        do_add = False
                        tally.update({tuple(candidate)})
                if do_add:
                    tally.update({tuple(schedule)})
            try:
                most_common_route = tally.most_common(1)[0][0]
                route.description = simplify_name(most_common_route[0]) + ' - ' + simplify_name(most_common_route[-1])
            except IndexError:
                # No trips at all, or the winning sequence has no regions.
                route.description = ''
            # Trips without any halts sort first instead of raising IndexError.
            route.trips = sorted(
                route.trips,
                key=lambda trip: trip.schedule[0].departure_time if trip.schedule else timedelta(),
            )

        # There is something really odd in Föli's data. Apparently the arrival
        # time of the last halt of a trip is completely wrong. My guess is that
        # it is the time of the next departure, but either way it is wrong.
        # So guess what the real arrival time is. It cannot go much worse than
        # the original incorrect data.
        bus_speed_coefficient = 750  # metres per minute
        for trip in all_trips.values():
            if len(trip.schedule) < 2:
                # No previous halt to extrapolate from.
                continue
            last_halt, previous_halt = trip.schedule[-1], trip.schedule[-2]
            last_leg_distance = last_halt.traveled_distance - previous_halt.traveled_distance
            last_halt.arrival_time = previous_halt.departure_time + timedelta(
                minutes=last_leg_distance / bus_speed_coefficient
            )


if __name__ == '__main__':
    from configparser import ConfigParser
    profile = ConfigParser()
    profile.read('profiles/föli.ini')
    load_buses('gtfs.zip', profile)