diff -r 659ab465152e -r f9788970fa46 buses.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/buses.py	Wed Jul 29 23:45:53 2020 +0300
@@ -0,0 +1,263 @@
+
+def old_load_gtfs(gtfs_zip_path):
+    global viimeinen_käyttöpäivä
+    from zipfile import ZipFile
+    with ZipFile(gtfs_zip_path) as gtfs_zip:
+        with gtfs_zip.open('trips.txt') as file:
+            for row in read_csv(map(bytes.decode, file)):
+                if row['service_id'] not in services:
+                    services[row['service_id']] = BusService(row['service_id'])
+                route = routes_per_id[row['route_id']]
+                trip = GtfsTrip(
+                    reference = row['trip_id'],
+                    route = route,
+                    service = services[row['service_id']],
+                    length = shape_distances.get(row.get('shape_id'), 1) * float(profile['metrics']['shape-modifier']),
+                    block_id = row.get('block_id') or row['service_id'],
+                    shape = row.get('shape_id')
+                )
+                route.trips.add(trip)
+                if trip.name in all_trips:
+                    print('Trip %s already exists' % trip.name)
+                else:
+                    all_trips[trip.name] = trip
+        print('%d trips' % len(all_trips), file = stderr)
+
+        def read_date(teksti):
+            return date(int(teksti[:4]), int(teksti[4:6]), int(teksti[6:]))
+
+        def read_time(teksti):
+            hour, minute, second = map(int, teksti.split(':'))
+            return timedelta(hours = hour, minutes = minute, seconds = second)
+
+        print('Loading dates... ', file = stderr, flush = True)
+        viimeinen_käyttöpäivä = date.today()
+
+        def date_range(start_date, end_date, *, include_end = False):
+            ''' Generates dates from start_date to end_date. If include_end is True, then end_date will be yielded. '''
+            current_date = start_date
+            while current_date < end_date:
+                yield current_date
+                current_date += timedelta(1)
+            if include_end:
+                yield end_date
+
+        def add_day_to_service(service_name, day):
+            try:
+                service = services[service_name]
+            except KeyError:
+                return
+            else:
+                service.dates.add(day)
+                if day not in services_for_day:
+                    services_for_day[day] = set()
+                services_for_day[day].add(service)
+                global viimeinen_käyttöpäivä
+                viimeinen_käyttöpäivä = max(day, viimeinen_käyttöpäivä)
+
+        def filter_day(row, day):
+            day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
+            return int(row[day_names[day.isoweekday() - 1]])
+
+        with gtfs_zip.open('calendar.txt') as file:
+            for row in read_csv(map(bytes.decode, file)):
+                for day in date_range(read_date(row['start_date']), read_date(row['end_date']), include_end = True):
+                    if filter_day(row, day):
+                        add_day_to_service(service_name = row['service_id'], day = day)
+
+        with gtfs_zip.open('calendar_dates.txt') as file:
+            for row in read_csv(map(bytes.decode, file)):
+                add_day_to_service(service_name = row['service_id'], day = read_date(row['date']))
+
+        def services_available_at(day):
+            for service in services.values():
+                if day in service.dates:
+                    yield service
+
+        print('Loading stops... ', file = stderr, end = '', flush = True)
+        with gtfs_zip.open('stops.txt') as file:
+            for row in read_csv(map(bytes.decode, file)):
+                location = Location(float(row['stop_lat']), float(row['stop_lon']))
+                stop = BusStop(
+                    reference = row['stop_id'],
+                    name = row['stop_name'],
+                    location = location,
+                    code = row.get('stop_code', row['stop_id']),
+                )
+                bus_stops[stop.reference] = stop
+        if profile['regions']['use-regions']:
+            with open('regions-per-stop.json') as file:
+                for stop_reference, region in json.load(file).items():
+                    try:
+                        bus_stops[stop_reference].region = region
+                    except KeyError:
+                        pass
+        for bus_stop in bus_stops.values():
+            if not hasattr(bus_stop, 'region'):
+                bus_stop.region = None
+        print('%d stops' % len(bus_stops), file = stderr)
+
+        from collections import defaultdict
+        bus_stops_by_name = defaultdict(set)
+        for bus_stop in bus_stops.values():
+            bus_stops_by_name[bus_stop.name].add(bus_stop)
+        bus_stops_by_name = dict(bus_stops_by_name)
+
+        # group bus_stops by name
+        global all_clusters
+        all_clusters = []
+        def cluster_gtfs_stops():
+            sorted_gtfs_stops = sorted(bus_stops.values(), key = lambda bus_stop: bus_stop.name)
+            for bus_stop in sorted_gtfs_stops:
+                if not bus_stop.cluster:
+                    stops_to_cluster = {bus_stop}
+                    # find the stop's same-named counterparts
+                    for pair_candidate in bus_stops_by_name[bus_stop.name]:
+                        distance = pair_candidate.location.distance(bus_stop.location)
+                        if pair_candidate is not bus_stop and distance <= 0.4:
+                            stops_to_cluster.add(pair_candidate)
+                    for stop_to_cluster in stops_to_cluster:
+                        if stop_to_cluster.cluster:
+                            cluster = stop_to_cluster.cluster
+                            break
+                    else:
+                        cluster = BusStopCluster()
+                        all_clusters.append(cluster)
+                    for stop_to_cluster in stops_to_cluster:
+                        if not stop_to_cluster.cluster:
+                            cluster.add_stop(stop_to_cluster)
+            # Record each stop's counterparts using the current clustering information
+            for bus_stop in bus_stops.values():
+                if bus_stop.cluster:
+                    bus_stop.pairs = bus_stop.cluster.stops - {bus_stop}
+            # Cluster the bus_stops that had no counterpart of their own together with other stops
+            for bus_stop in sorted_gtfs_stops:
+                if len(bus_stop.cluster.stops) == 1:
+                    possibilities = set()
+                    for cluster in all_clusters:
+                        if cluster is not bus_stop.cluster:
+                            distance = cluster.center.distance(bus_stop.location)
+                            if distance <= 0.4:
+                                possibilities.add((distance, cluster))
+                    if possibilities:
+                        best = min(possibilities)[1]
+                        all_clusters.remove(bus_stop.cluster)
+                        best.merge(bus_stop.cluster)
+
+        def shared_elements_in_n_sets(sets):
+            from itertools import combinations
+            result = set()
+            for pair in combinations(sets, 2):
+                result |= pair[0] & pair[1]
+            return result
+
+        def name_clusters():
+            from collections import defaultdict
+            clusters_per_name = defaultdict(set)
+            for cluster in all_clusters:
+                name_representing_stop = min((len(stop.reference), stop.reference, stop) for stop in cluster.stops)[2]
+                clusters_per_name[name_representing_stop.name].add(cluster)
+            for name, clusters in clusters_per_name.items():
+                if len(clusters) == 1:
+                    # Simple case: this cluster is the only one that wants this name.
+                    next(iter(clusters)).name = name
+                else:
+                    if profile['regions']['use-regions']:
+                        # Find out if all clusters are in different areas
+                        common_regions = shared_elements_in_n_sets({stop.region for stop in cluster.stops} for cluster in clusters)
+                        # Proposal: cluster -> the areas unique to the cluster
+                        proposal = {
+                            cluster: {stop.region for stop in cluster.stops} - common_regions - {None}
+                            for cluster in clusters
+                        }
+                        # If at most one cluster is without its own unique region, name the others by region and this one without any.
+                        if sum([1 for unique_areas in proposal.values() if not unique_areas]) <= 1:
+                            for cluster, unique_areas in proposal.items():
+                                individual_cluster_name = name
+                                if unique_areas:
+                                    individual_cluster_name += ' (' + min(unique_areas) + ')'
+                                cluster.name = individual_cluster_name
+                            continue
+                    # If all else fails, just number them.
+                    for n, (_, cluster) in enumerate(sorted(
+                        min((stop.reference.lower(), cluster) for stop in cluster.stops)
+                        for cluster in clusters
+                    ), 1):
+                        individual_cluster_name = name + '-' + str(n)
+                        cluster.name = individual_cluster_name
+
+        print('Clustering bus stops...')
+        cluster_gtfs_stops()
+        name_clusters()
+        for cluster in all_clusters:
+            if cluster.url_name in clusters_by_name:
+                print('Warning: Clusters %r and %r share the same URL name: %r' % (cluster.name, clusters_by_name[cluster.url_name].name, cluster.url_name))
+            else:
+                clusters_by_name[cluster.url_name] = cluster
+        print('Loading schedules... ', end = '', flush = True, file = stderr)
+        with gtfs_zip.open('stop_times.txt') as file:
+            row_count = sum(line.count(b'\n') for line in file)
+        with gtfs_zip.open('stop_times.txt') as file:
+            progress = 0
+            for row in read_csv(map(bytes.decode, file)):
+                if int(row.get('pickup_type', '') or '0') and int(row.get('drop_off_type', '') or '0'):
+                    continue
+                trip = all_trips[transform_trip_reference(row['trip_id'])]
+                arrival_time = read_time(row['arrival_time'])
+                departure_time = read_time(row['departure_time'])
+                stop = bus_stops[row['stop_id']]
+                traveled_distance = float(row.get('shape_dist_traveled', 1)) * float(profile['metrics']['shape-modifier'])
+                visitnumber = len(trip.schedule) + 1
+                trip.schedule.append(BusHalt(arrival_time, departure_time, stop, trip, traveled_distance, visitnumber))
+                stop.involved_trips.add(trip)
+                progress += 1
+                if progress % 1000 == 0:
+                    print('\rLoading schedules... %.1f%%' % (progress * 100 / row_count), end = ' ', file = stderr)
+        print('\rLoading schedules... complete', file = stderr)
+        for trip in all_trips.values():
+            from busroute import simplify_name
+            schedule = trip.concise_schedule()
+            try:
+                trip.from_place = simplify_name(schedule[0])
+                trip.to_place = simplify_name(schedule[-1])
+            except IndexError:
+                trip.from_place = ''
+                trip.to_place = ''
+        for route in routes.values():
+            from collections import Counter
+            from busroute import simplify_name
+            tally = Counter()
+            for trip in route.trips:
+                schedule = trip.concise_schedule()
+                places = set(schedule)
+                do_add = True
+                assert type(schedule) is list
+                for candidate in tally:
+                    if places.issubset(set(candidate)):
+                        do_add = False
+                        tally.update({tuple(candidate)})
+                if do_add:
+                    tally.update({tuple(schedule)})
+            try:
+                most_common_route = tally.most_common(1)[0][0]
+                route.description = simplify_name(most_common_route[0]) + ' - ' + simplify_name(most_common_route[-1])
+            except:
+                route.description = ''
+            route.trips = sorted(route.trips, key = lambda trip: trip.schedule and trip.schedule[0].departure_time or timedelta())
+        if 'compatibility' in profile and profile['compatibility'].get('fix-destination-times', False):
+            # There seems to be something strange going on in Föli's gtfs data.
+            # It seems that sometimes the arrival time of the last stop is
+            # completely off, so try to estimate when the bus will really arrive
+            # there based on the last leg distance.
+            # I noticed this for bus 220's arrival time at Mylly several years
+            # ago. Possibly this has been fixed in the data by now?
+            for trip in all_trips.values():
+                if len(trip.schedule) >= 2:
+                    bus_speed_coefficient = 750 # meters per minute
+                    last_leg_distance = trip.schedule[-1].traveled_distance - trip.schedule[-2].traveled_distance
+                    trip.schedule[-1].arrival_time = trip.schedule[-2].departure_time + timedelta(minutes = last_leg_distance / bus_speed_coefficient)
+        # Add services to all bus stops
+        for route in routes.values():
+            for trip in route.trips:
+                for halt in trip.schedule:
+                    halt.stop.services.add(route.service)
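
The densest part of the changeset is cluster_gtfs_stops(). The standalone sketch below restates only its first pass, the rule that stops sharing a name and lying within 0.4 km of each other belong to one cluster. It is illustrative only: the Stop dataclass, the sample coordinates and the equirectangular distance helper are assumptions made for the sketch, not part of buses.py, which uses its own BusStop, BusStopCluster and Location.distance and additionally merges leftover single-stop clusters into the nearest cluster within the same 0.4 km radius.

# Minimal sketch of the same-name clustering rule; not the code from buses.py.
from dataclasses import dataclass
from math import cos, hypot, radians

@dataclass(frozen = True)
class Stop:
    reference: str
    name: str
    lat: float
    lon: float

def distance_km(a, b):
    # Equirectangular approximation; adequate at city scale.
    mean_lat = radians((a.lat + b.lat) / 2)
    dx = radians(b.lon - a.lon) * cos(mean_lat) * 6371.0
    dy = radians(b.lat - a.lat) * 6371.0
    return hypot(dx, dy)

def cluster_by_name(stops, threshold_km = 0.4):
    clusters = []    # list of sets of Stop
    cluster_of = {}  # Stop -> the set it belongs to
    for stop in sorted(stops, key = lambda s: s.name):
        if stop in cluster_of:
            continue
        # Same-named stops within the threshold are candidates for one cluster.
        nearby = {other for other in stops
                  if other.name == stop.name and distance_km(stop, other) <= threshold_km}
        # Reuse an existing cluster if any same-named neighbour already has one.
        target = next((cluster_of[s] for s in nearby if s in cluster_of), None)
        if target is None:
            target = set()
            clusters.append(target)
        for s in nearby:
            if s not in cluster_of:
                target.add(s)
                cluster_of[s] = target
    return clusters

if __name__ == '__main__':
    stops = [
        Stop('1', 'Kauppatori', 60.4515, 22.2665),
        Stop('2', 'Kauppatori', 60.4512, 22.2671),  # opposite platform, same name
        Stop('3', 'Kauppatori', 60.4700, 22.3000),  # same name but too far away
    ]
    for cluster in cluster_by_name(stops):
        print(sorted(s.reference for s in cluster))

Running the sketch prints ['1', '2'] followed by ['3']: the two same-named platforms are grouped, while the distant namesake stays in its own cluster.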