Moving data from sqlalchemy to a pandas DataFrame
Simply add an __init__ method to your model and call the class object (i.e., create and persist instances) before building the DataFrame. Specifically, the code below creates an iterable of tuples that is bound into columns with pandas.DataFrame().
class LPRRank(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    candid = db.Column(db.String(40), index=True, unique=False)
    rank = db.Column(db.Integer, index=True, unique=False)
    user_id = db.Column(db.Integer, db.ForeignKey('lprvote.id'))

    def __init__(self, candid=None, rank=None, user_id=None):
        # Assign the mapped columns so each instance carries its own values
        self.candid = candid
        self.rank = rank
        self.user_id = user_id

    def __repr__(self):
        # __repr__ must return a string, not a tuple
        return '<LPRRank {} {} {}>'.format(self.candid, self.rank, self.user_id)
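To "call the class object before building the DataFrame", you first populate the table by instantiating the model. A minimal sketch, assuming the same Flask-SQLAlchemy db session and entirely made-up sample values:

# Hypothetical sample rows, only so there is something to query afterwards
db.session.add_all([
    LPRRank(candid='A001', rank=1, user_id=1),
    LPRRank(candid='A002', rank=2, user_id=1),
])
db.session.commit()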
data = db.session.query(LPRRank).all()
df = pd.DataFrame([(d.candid, d.rank, d.user_id) for d in data],
                  columns=['candid', 'rank', 'user_id'])
Alternatively, use the SQLAlchemy ORM based on your defined model class, LPRRank, to run read_sql:
df = pd.read_sql(sql=db.session.query(LPRRank)
                        .with_entities(LPRRank.candid,
                                       LPRRank.rank,
                                       LPRRank.user_id).statement,
                 con=db.session.bind)
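If you want every mapped column rather than a hand-picked subset, a shorter variant (a sketch under the same Flask-SQLAlchemy assumptions as above) is to pass the query's statement directly:

# Read all columns of LPRRank straight into a DataFrame
df_all = pd.read_sql(db.session.query(LPRRank).statement, db.session.bind)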
The answer by Parfait is good but could have two problems:
- efficiency: each object's data is duplicated into the DataFrame, so building it from a long list of objects can take time
- it does not mirror a DataFrame as a collection of rows
The example below therefore provides a parent class that plays the role of the DataFrame and a child class that plays the role of a single row of that DataFrame.
The code shows two ways to get a DataFrame; the DataFrame object is only created on demand, so as not to waste CPU and memory.
If the DataFrame is needed at creation time, you only have to add a constructor (def __init__(self, rows: List[MyDataFrameRow] = None)...), create a new attribute, and assign it the result of self.data_frame (see the sketch at the end of this answer).
from typing import Tuple

from pandas import DataFrame, read_sql
from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.orm import declarative_base, relationship, Session

Base = declarative_base()


class MyDataFrame(Base):
    __tablename__ = 'my_data_frame'
    id = Column(Integer, primary_key=True)
    rows = relationship('MyDataFrameRow', cascade='all,delete')

    @property
    def data_frame(self) -> DataFrame:
        # Build the DataFrame in memory from the already-loaded child rows
        columns = MyDataFrameRow.data_frame_columns()
        return DataFrame([[getattr(row, column) for column in columns] for row in self.rows],
                         columns=columns)

    @staticmethod
    def to_data_frame(identifier: int, session: Session) -> DataFrame:
        # Let the database do the work and read the result straight into pandas
        query = session.query(MyDataFrameRow).join(MyDataFrame).filter(MyDataFrame.id == identifier)
        return read_sql(query.statement, session.get_bind())


class MyDataFrameRow(Base):
    __tablename__ = 'my_data_row'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    age = Column(Integer)
    number_of_children = Column(Integer)
    height = Column(Integer)
    parent_id = Column(Integer, ForeignKey('my_data_frame.id'))

    @staticmethod
    def data_frame_columns() -> Tuple[str, ...]:
        # Only the data columns: skip the primary key and foreign keys
        return tuple(column.name for column in MyDataFrameRow.__table__.columns
                     if len(column.foreign_keys) == 0 and column.primary_key is False)
...
session = Session(...)
df1 = MyDataFrame.to_data_frame(1, session)
my_table_obj = session.query(MyDataFrame).filter(MyDataFrame.id == 1).one()
df2 = my_table_obj.data_frame
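For reference, the eager variant mentioned above could look roughly like the method below, added inside MyDataFrame (a sketch only; the df attribute name is an assumption, and List comes from typing):

    def __init__(self, rows: List['MyDataFrameRow'] = None):
        # Keep the child rows and build the DataFrame immediately
        self.rows = rows if rows is not None else []
        # self.df is a plain Python attribute, not a mapped column
        self.df = self.data_frame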