Fri, 05 Feb 2021 12:16:29 +0200
update
def old_load_gtfs(gtfs_zip_path):
    """Load a GTFS feed from the zip archive at gtfs_zip_path into module state.

    Reads trips.txt, calendar.txt, calendar_dates.txt, stops.txt and
    stop_times.txt, then post-processes the data: clusters stops by name and
    proximity, names the clusters, derives route descriptions and (optionally,
    per profile) fixes last-stop arrival times.

    NOTE(review): mutates many module-level names defined elsewhere in this
    file (read_csv, services, services_for_day, routes, routes_per_id,
    profile, bus_stops, clusters_by_name, all_trips, shape_distances,
    transform_trip_reference, BusService, GtfsTrip, BusStop, BusStopCluster,
    BusHalt, Location, ...).
    """
    # viimeinen_käyttöpäivä (Finnish: "last day of use") tracks the latest
    # service date seen anywhere in the feed.
    global viimeinen_käyttöpäivä
    from zipfile import ZipFile
    with ZipFile(gtfs_zip_path) as gtfs_zip:
        # --- trips.txt: build one GtfsTrip per row, registered per route ---
        with gtfs_zip.open('trips.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                if row['service_id'] not in services:
                    services[row['service_id']] = BusService(row['service_id'])
                route = routes_per_id[row['route_id']]
                trip = GtfsTrip(
                    reference = row['trip_id'],
                    route = route,
                    service = services[row['service_id']],
                    # Shape length scaled by the profile's shape-modifier;
                    # falls back to 1 when the trip has no known shape.
                    length = shape_distances.get(row.get('shape_id'), 1) * float(profile['metrics']['shape-modifier']),
                    # Fall back to the service id when block_id is absent/empty.
                    block_id = row.get('block_id') or row['service_id'],
                    shape = row.get('shape_id')
                )
                route.trips.add(trip)
                if trip.name in all_trips:
                    print('Trip %s already exists' % trip.name)
                else:
                    all_trips[trip.name] = trip
        print('%d trips' % len(all_trips), file = stderr)

        def read_date(teksti):
            # Parse a GTFS date field, e.g. '20210205' -> date(2021, 2, 5).
            return date(int(teksti[:4]), int(teksti[4:6]), int(teksti[6:]))

        def read_time(teksti):
            # Parse a GTFS 'HH:MM:SS' time into a timedelta.  A timedelta is
            # used because GTFS allows hours >= 24 for service past midnight.
            hour, minute, second = map(int, teksti.split(':'))
            return timedelta(hours = hour, minutes = minute, seconds = second)

        print('Loading dates... ', file = stderr, flush = True)
        viimeinen_käyttöpäivä = date.today()

        def date_range(start_date, end_date, *, include_end = False):
            '''Generate dates from start_date to end_date.

            If include_end is True, end_date itself is also yielded.
            '''
            current_date = start_date
            while current_date < end_date:
                yield current_date
                current_date += timedelta(1)
            if include_end:
                yield end_date

        def add_day_to_service(service_name, day):
            # Mark `day` as an operating day of the named service, index it in
            # services_for_day, and advance the global last-use date.
            # Unknown service ids are silently ignored.
            try:
                service = services[service_name]
            except KeyError:
                return
            else:
                service.dates.add(day)
                if day not in services_for_day:
                    services_for_day[day] = set()
                services_for_day[day].add(service)
                global viimeinen_käyttöpäivä
                viimeinen_käyttöpäivä = max(day, viimeinen_käyttöpäivä)

        def filter_day(row, day):
            # Return the calendar.txt weekday flag (as int 0/1) applying to `day`.
            day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
            return int(row[day_names[day.isoweekday() - 1]])

        # calendar.txt: regular weekly service between start_date and end_date.
        with gtfs_zip.open('calendar.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                for day in date_range(read_date(row['start_date']), read_date(row['end_date']), include_end = True):
                    if filter_day(row, day):
                        add_day_to_service(service_name = row['service_id'], day = day)
        # calendar_dates.txt: service exceptions.
        # NOTE(review): exception_type is not inspected, so removal rows
        # (exception_type=2) would be treated as additions — confirm the feed
        # only contains additions.
        with gtfs_zip.open('calendar_dates.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                add_day_to_service(service_name = row['service_id'], day = read_date(row['date']))

        def services_available_at(day):
            # Yield every service operating on `day`.  (Not called within this
            # function as far as visible here.)
            for service in services.values():
                if day in service.dates:
                    yield service

        print('Loading stops... ', file = stderr, end = '', flush = True)
        # --- stops.txt: build BusStop objects keyed by stop_id ---
        with gtfs_zip.open('stops.txt') as file:
            for row in read_csv(map(bytes.decode, file)):
                location = Location(float(row['stop_lat']), float(row['stop_lon']))
                stop = BusStop(
                    reference = row['stop_id'],
                    name = row['stop_name'],
                    location = location,
                    # Prefer the rider-visible stop code; fall back to the id.
                    code = row.get('stop_code', row['stop_id']),
                )
                bus_stops[stop.reference] = stop
        if profile['regions']['use-regions']:
            # Attach a region to each stop from the sidecar JSON mapping;
            # references not present in this feed are skipped.
            with open('regions-per-stop.json') as file:
                for stop_reference, region in json.load(file).items():
                    try:
                        bus_stops[stop_reference].region = region
                    except KeyError:
                        pass
        # Ensure every stop has a region attribute, even if only None.
        for bus_stop in bus_stops.values():
            if not hasattr(bus_stop, 'region'):
                bus_stop.region = None
        print('%d stops' % len(bus_stops), file = stderr)

        from collections import defaultdict
        # Group bus_stops by name.
        bus_stops_by_name = defaultdict(set)
        for bus_stop in bus_stops.values():
            bus_stops_by_name[bus_stop.name].add(bus_stop)
        bus_stops_by_name = dict(bus_stops_by_name)

        global all_clusters
        all_clusters = []

        def cluster_gtfs_stops():
            # Cluster stops that share a name and lie within 0.4 distance
            # units of each other (presumably kilometres — TODO confirm the
            # unit of Location.distance).
            sorted_gtfs_stops = sorted(bus_stops.values(), key = lambda bus_stop: bus_stop.name)
            for bus_stop in sorted_gtfs_stops:
                if not bus_stop.cluster:
                    stops_to_cluster = {bus_stop}
                    # Find the stop's same-named counterparts nearby.
                    for pair_candidate in bus_stops_by_name[bus_stop.name]:
                        distance = pair_candidate.location.distance(bus_stop.location)
                        if pair_candidate is not bus_stop and distance <= 0.4:
                            stops_to_cluster.add(pair_candidate)
                    # Reuse an existing cluster if any candidate already has
                    # one; otherwise open a new cluster (for-else).
                    for stop_to_cluster in stops_to_cluster:
                        if stop_to_cluster.cluster:
                            cluster = stop_to_cluster.cluster
                            break
                    else:
                        cluster = BusStopCluster()
                        all_clusters.append(cluster)
                    for stop_to_cluster in stops_to_cluster:
                        if not stop_to_cluster.cluster:
                            cluster.add_stop(stop_to_cluster)
            # Record each stop's counterparts using the clustering built so far.
            for bus_stop in bus_stops.values():
                if bus_stop.cluster:
                    bus_stop.pairs = bus_stop.cluster.stops - {bus_stop}
            # Merge stops that had no counterpart of their own into the
            # nearest other cluster within range.
            for bus_stop in sorted_gtfs_stops:
                if len(bus_stop.cluster.stops) == 1:
                    possibilities = set()
                    for cluster in all_clusters:
                        if cluster is not bus_stop.cluster:
                            distance = cluster.center.distance(bus_stop.location)
                            if distance <= 0.4:
                                possibilities.add((distance, cluster))
                    if possibilities:
                        # Closest candidate wins (tuples sort by distance first).
                        best = min(possibilities)[1]
                        all_clusters.remove(bus_stop.cluster)
                        best.merge(bus_stop.cluster)

        def shared_elements_in_n_sets(sets):
            # Return every element appearing in at least two of the given sets.
            from itertools import combinations
            result = set()
            for pair in combinations(sets, 2):
                result |= pair[0] & pair[1]
            return result

        def name_clusters():
            from collections import defaultdict
            # Pick each cluster's representative name: the name of the stop
            # with the shortest, then lexicographically smallest, reference.
            clusters_per_name = defaultdict(set)
            for cluster in all_clusters:
                name_representing_stop = min((len(stop.reference), stop.reference, stop) for stop in cluster.stops)[2]
                clusters_per_name[name_representing_stop.name].add(cluster)
            for name, clusters in clusters_per_name.items():
                if len(clusters) == 1:
                    # Simple case: this cluster is the only one that wants this name.
                    next(iter(clusters)).name = name
                else:
                    if profile['regions']['use-regions']:
                        # Find out if all clusters are in different areas
                        common_regions = shared_elements_in_n_sets({stop.region for stop in cluster.stops} for cluster in clusters)
                        # Proposal: cluster -> the areas unique to the cluster
                        proposal = {
                            cluster: {stop.region for stop in cluster.stops} - common_regions - {None}
                            for cluster in clusters
                        }
                        # If at most one cluster is without its own unique region,
                        # name the others by region and this one without any.
                        if sum([1 for unique_areas in proposal.values() if not unique_areas]) <= 1:
                            for cluster, unique_areas in proposal.items():
                                individual_cluster_name = name
                                if unique_areas:
                                    individual_cluster_name += ' (' + min(unique_areas) + ')'
                                cluster.name = individual_cluster_name
                            # NOTE(review): this `break` exits the outer loop over
                            # clusters_per_name, leaving any remaining duplicate
                            # names unprocessed — looks like it was meant to be
                            # `continue`.  Confirm before changing.
                            break
                    # If all else fails, just number them.
                    for n, (_, cluster) in enumerate(sorted(
                        min((stop.reference.lower(), cluster) for stop in cluster.stops)
                        for cluster in clusters
                    ), 1):
                        individual_cluster_name = name + '-' + str(n)
                        cluster.name = individual_cluster_name

        print('Clustering bus stops...')
        cluster_gtfs_stops()
        name_clusters()
        # Index clusters by URL name, warning on collisions (first one wins).
        for cluster in all_clusters:
            if cluster.url_name in clusters_by_name:
                print('Warning: Clusters %r and %r share the same URL name: %r' % (cluster.name, clusters_by_name[cluster.url_name].name, cluster.url_name))
            else:
                clusters_by_name[cluster.url_name] = cluster

        print('Loading schedules... ', end = '', flush = True, file = stderr)
        # First pass over stop_times.txt: count rows (via newlines) so the
        # second pass can report progress as a percentage.
        with gtfs_zip.open('stop_times.txt') as file:
            row_count = sum(line.count(b'\n') for line in file)
        with gtfs_zip.open('stop_times.txt') as file:
            progress = 0
            for row in read_csv(map(bytes.decode, file)):
                # Skip halts where the bus neither picks up nor drops off
                # (both flags non-zero; absent/empty fields count as 0).
                if int(row.get('pickup_type', '') or '0') and int(row.get('drop_off_type', '') or '0'):
                    continue
                trip = all_trips[transform_trip_reference(row['trip_id'])]
                arrival_time = read_time(row['arrival_time'])
                departure_time = read_time(row['departure_time'])
                stop = bus_stops[row['stop_id']]
                traveled_distance = float(row.get('shape_dist_traveled', 1)) * float(profile['metrics']['shape-modifier'])
                visitnumber = len(trip.schedule) + 1
                trip.schedule.append(BusHalt(arrival_time, departure_time, stop, trip, traveled_distance, visitnumber))
                stop.involved_trips.add(trip)
                progress += 1
                if progress % 1000 == 0:
                    print('\rLoading schedules... %.1f%%' % (progress * 100 / row_count), end = ' ', file = stderr)
        print('\rLoading schedules... complete', file = stderr)

        # Derive each trip's endpoint names from its concise schedule.
        for trip in all_trips.values():
            from busroute import simplify_name
            schedule = trip.concise_schedule()
            try:
                trip.from_place = simplify_name(schedule[0])
                trip.to_place = simplify_name(schedule[-1])
            except IndexError:
                # Empty schedule -> no endpoints known.
                trip.from_place = ''
                trip.to_place = ''

        # Describe each route by its most common itinerary's endpoints.
        for route in routes.values():
            from collections import Counter
            from busroute import simplify_name
            # Tally itineraries; an itinerary whose places are a subset of an
            # already-seen itinerary is counted towards that superset instead.
            tally = Counter()
            for trip in route.trips:
                schedule = trip.concise_schedule()
                places = set(schedule)
                do_add = True
                assert type(schedule) is list
                for candidate in tally:
                    if places.issubset(set(candidate)):
                        do_add = False
                        tally.update({tuple(candidate)})
                if do_add:
                    tally.update({tuple(schedule)})
            try:
                most_common_route = tally.most_common(1)[0][0]
                route.description = simplify_name(most_common_route[0]) + ' - ' + simplify_name(most_common_route[-1])
            except:
                # NOTE(review): bare except — an empty tally raising IndexError
                # is the expected case, but this also swallows everything else.
                route.description = ''
            # Sort trips by first departure; schedule-less trips sort first
            # via the timedelta() fallback.
            route.trips = sorted(route.trips, key = lambda trip: trip.schedule and trip.schedule[0].departure_time or timedelta())

        if 'compatibility' in profile and profile['compatibility'].get('fix-destination-times', False):
            # There seems to be something strange going on in Föli's gtfs data.
            # It seems that sometimes the arrival time of the last stop is
            # completely off, so try estimate when the bus will really arrive
            # there based on the last leg distance.
            # I noticed this for bus 220's arrival time at Mylly several years
            # ago. Possibly this has been fixed in the data by now?
            for trip in all_trips.values():
                if len(trip.schedule) >= 2:
                    bus_speed_coefficient = 750 # meters per minute
                    last_leg_distance = trip.schedule[-1].traveled_distance - trip.schedule[-2].traveled_distance
                    trip.schedule[-1].arrival_time = trip.schedule[-2].departure_time + timedelta(minutes = last_leg_distance / bus_speed_coefficient)

        # Add services to all bus stops
        for route in routes.values():
            for trip in route.trips:
                for halt in trip.schedule:
                    halt.stop.services.add(route.service)