# https://rajasgs.gitbook.io/pynotes/gcp#data-modeling
# https://github.com/tactlabs/student-hiring-prediction-mle/tree/main/dataset
import pyutil as pyu
pyu.get_local_pyinfo()
'conda env: ml311; pyv: 3.11.10 (main, Oct 3 2024, 07:29:13) [GCC 11.2.0]'
!pip show duckdb | grep "Version:"
# Connect to DuckDB
con = duckdb.connect()
# Load CSV file
con.execute("CREATE TABLE automobile AS SELECT * FROM '../dataset/automobile_data.csv'")
<duckdb.duckdb.DuckDBPyConnection at 0x7f2df2f47770>
# Perform SQL query
result = con.execute("SELECT * FROM automobile").fetchall()
import pandas as pd
from IPython.display import display
def list2table(cdata, col_list = None):
first_row = cdata[0]
cols_count = len(first_row)
if not col_list:
col_list = []
for idx in range(cols_count):
col_list.append(f'col_{idx}')
# Convert the list of tuples to a DataFrame
df1 = pd.DataFrame(cdata, columns = col_list)
# Set display options
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# Apply custom styling
styled_df = df1.style.set_properties(**{'text-align': 'center'})
# Display the DataFrame
display(styled_df)
def q(query, col_list = None):
result = con.execute(query).fetchall()
# print(result)
return list2table(result, col_list)
q("SELECT * FROM automobile limit 2")
| |
col_0 |
col_1 |
col_2 |
col_3 |
col_4 |
col_5 |
col_6 |
col_7 |
col_8 |
col_9 |
col_10 |
col_11 |
col_12 |
col_13 |
col_14 |
col_15 |
col_16 |
col_17 |
col_18 |
col_19 |
col_20 |
col_21 |
col_22 |
col_23 |
col_24 |
col_25 |
| 0 |
3 |
? |
alfa-romero |
gas |
std |
two |
convertible |
rwd |
front |
88.600000 |
168.800000 |
64.100000 |
48.800000 |
2548 |
dohc |
four |
130 |
mpfi |
3.47 |
2.68 |
9.000000 |
111 |
5000 |
21 |
27 |
13495 |
| 1 |
3 |
? |
alfa-romero |
gas |
std |
two |
convertible |
rwd |
front |
88.600000 |
168.800000 |
64.100000 |
48.800000 |
2548 |
dohc |
four |
130 |
mpfi |
3.47 |
2.68 |
9.000000 |
111 |
5000 |
21 |
27 |
16500 |
q("""
SELECT
make, "num-of-doors", COUNT(*) AS door_count
FROM automobile
WHERE "num-of-doors" IN ('two', 'four')
GROUP BY
make,
"num-of-doors"
LIMIT 2
""")
| |
col_0 |
col_1 |
col_2 |
| 0 |
audi |
two |
2 |
| 1 |
chevrolet |
two |
2 |
q("""
SELECT *
FROM automobile
WHERE
CAST(horsepower as INT) > 150
and horsepower != '?'
limit 2
""")
| |
col_0 |
col_1 |
col_2 |
col_3 |
col_4 |
col_5 |
col_6 |
col_7 |
col_8 |
col_9 |
col_10 |
col_11 |
col_12 |
col_13 |
col_14 |
col_15 |
col_16 |
col_17 |
col_18 |
col_19 |
col_20 |
col_21 |
col_22 |
col_23 |
col_24 |
col_25 |
| 0 |
1 |
? |
alfa-romero |
gas |
std |
two |
hatchback |
rwd |
front |
94.500000 |
171.200000 |
65.500000 |
52.400000 |
2823 |
ohcv |
six |
152 |
mpfi |
2.68 |
3.47 |
9.000000 |
154 |
5000 |
19 |
26 |
16500 |
| 1 |
0 |
? |
audi |
gas |
turbo |
two |
hatchback |
4wd |
front |
99.500000 |
178.200000 |
67.900000 |
52.000000 |
3053 |
ohc |
five |
131 |
mpfi |
3.13 |
3.4 |
7.000000 |
160 |
5500 |
16 |
22 |
? |
q("""
SELECT
‘fuel-type’,
make,
COUNT(*) AS number_cars
FROM automobile
WHERE
'fuel-type' IN ('gas', 'diesel')
GROUP BY make, 'fuel-type'
""")
---------------------------------------------------------------------------
BinderException Traceback (most recent call last)
Cell In[35], line 1
----> 1 q("""
2 SELECT
3 ‘fuel-type’,
4 make,
5 COUNT(*) AS number_cars
6 FROM automobile
7 WHERE
8 'fuel-type' IN ('gas', 'diesel')
9
10 GROUP BY make, 'fuel-type'
11 """)
Cell In[19], line 3, in q(query, col_list)
1 def q(query, col_list = None):
----> 3 result = con.execute(query).fetchall()
4 # print(result)
6 return list2table(result, col_list)
BinderException: Binder Error: Referenced column "‘fuel" not found in FROM clause!
Candidate bindings: "automobile.fuel-type", "automobile.fuel-system"
LINE 3: ‘fuel-type’,
^
Score: 20