forked from Kaggle/learntools
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathex2.py
More file actions
103 lines (88 loc) · 4.33 KB
/
Copy pathex2.py
File metadata and controls
103 lines (88 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
from google.cloud import bigquery
from learntools.core import *
# Setup (2.42s on Kaggle)
client = bigquery.Client()
# (1) WhichCountries
first_query = """
SELECT country
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE unit = "ppm"
"""
first_query_job = client.query(first_query)
first_results_answer = first_query_job.to_dataframe()
# (2) ZeroPollution
zero_pollution_query = """
SELECT *
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE value = 0
"""
zero_pollution_query_job = client.query(zero_pollution_query)
zero_pollution_answer = zero_pollution_query_job.to_dataframe()
class WhichCountries(CodingProblem):
_vars = ['first_query', 'first_results']
def check(self, query, results):
# check 1: words appear in query
assert ("`bigquery-public-data.openaq.global_air_quality`" in query), \
('Your query should be pulling data FROM `bigquery-public-data.openaq.global_air_quality`.')
# check 2: column names
lowered_colnames = [c.lower() for c in results.columns]
assert ('country' in results.columns), ("You didn't select the `country` column. Try again.")
# check 3: values in dataframe
assert (set(results.country) == set(first_results_answer.country)), ("You have the wrong set of countries. Check your **WHERE** clause.")
assert (len(results.columns) == 1), ("Nice job. You selected the right countries, but you selected other columns too. "
"See if you can select `country` without other columns.")
if len(results) == len(first_results_answer):
print("You got the right countries. Nice job! Some countries showed up many times in the results. "
"To get each country only once you can run `SELECT DISTINCT country ...`. "
"The DISTINCT keyword ensures each column shows up once, which you'll want in some cases.")
_solution = CS(\
"""
first_query = \"""
SELECT country
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE unit = "ppm"
\"""
# Or to get each country just once, you could use
first_query = \"""
SELECT DISTINCT country
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE unit = "ppm"
\"""
"""
)
class ZeroPollution(CodingProblem):
_vars = ['zero_pollution_query', 'zero_pollution_results']
def check(self, query, results):
# check 1: query contains certain words
assert (type(query) == str), ("You don't have a valid query yet. Try again.")
assert ('select' in query.lower()), ("You don't have a valid query yet. Try again.")
assert ('from' in query.lower()), ("You don't have a valid query yet. Try again.")
assert ('value' in query.lower()), ("You don't have the right **WHERE** clause yet. Try again.")
assert ('`bigquery-public-data.openaq.global_air_quality`' in query.lower()), \
("You should be selecting the data `FROM bigquery-public-data.openaq.global_air_quality`.")
# check 2: correct columns selected
assert ('value' in set(results.columns)), ("You didn't select the `value` column.")
assert (set(results.columns) == set(zero_pollution_answer.columns)), ("Please select **all columns** of the rows where pollution levels were reported to be exactly 0.")
# check 3: check values, length of dataframe
assert (all(results["value"]== 0.0)), ("Some rows have a pollution level that is greater than 0.")
assert(len(results)==len(zero_pollution_answer)), ("The results don't look right. Try again.")
_solution = CS( \
"""
zero_pollution_query = \"""
SELECT *
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE value = 0
\"""
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(zero_pollution_query, job_config=safe_config)
zero_pollution_results = query_job.to_dataframe()
"""
)
qvars = bind_exercises(globals(), [
WhichCountries,
ZeroPollution,
],
var_format='q_{n}',
)
__all__ = list(qvars)