如何在大数据量下优化此查询

时间:2017-04-03 06:00:56

标签: sql postgresql optimization

-- Report: number of tickets per (ticket_type, remaining_uses, source).
-- A ticket is counted when its latest TicketHistory row has
-- final_remaining_uses > 0 and final_status = 1, and the ticket itself passes
-- the expiry / source / created_date filters in the outer WHERE clause.
SELECT t.ticket_type, f_rows.remaining_uses, t.source, count(t.id) AS total
FROM (
    -- Filter rows to get those where remaining_uses > 0 and status = 1
    SELECT * FROM (
        -- Get all the latest rows for each ticket
        SELECT th.ticket_id, th.final_remaining_uses AS remaining_uses, th.final_status AS status, th.action_when
        FROM TicketHistory th
        -- The derived table is not LATERAL, so it cannot reference the outer
        -- alias "th"; aggregate its own id column instead (was: max(th.id)).
        INNER JOIN (SELECT max(id) AS id FROM TicketHistory GROUP BY ticket_id) maxid ON th.id = maxid.id
    ) latest_rows
    WHERE remaining_uses > 0 AND status = 1 --and (action_when < current_date and action_when > current_date -30)
) f_rows
INNER JOIN Ticket t ON f_rows.ticket_id = t.id
WHERE t.expiry_date >= current_date - 1
  AND t.source IN (0,1,2,6,7,8)
  -- created strictly within the 30 days before today
  AND (t.created_date < current_date AND t.created_date > current_date - 30)
GROUP BY t.ticket_type, f_rows.remaining_uses, t.source
ORDER BY t.source, t.ticket_type, f_rows.remaining_uses;

我在这里做的是：先从历史记录表中获取每张票的最新一行；然后过滤掉未激活或已没有剩余使用次数的票证所对应的行；最后再用有效期和其他条件对数据进行过滤。

有没有办法优化此查询?目前这个查询需要很长时间才能在postgresql崩溃之前返回任何数据。

票证和票证历史记录各有超过11M行。

修改

-- Table ticket: primary key id; referenced by tickethistory.ticket_id.
-- The report queries in this document filter on source, expiry_date and
-- created_date and group by ticket_type and source.
-- NOTE(review): this DDL looks like an abridged excerpt. The foreign key below
-- references sale_id, which is not among the columns declared here, so the
-- statement will not run as shown. Confirm against the real schema.
CREATE TABLE ticket
(
  id serial NOT NULL,
  source integer NOT NULL,
  status integer NOT NULL,
  ticket_type integer NOT NULL,
  remaining_uses integer NOT NULL,
  expiry_date timestamp with time zone NOT NULL,
  price numeric(20,2) NOT NULL,
  created_date timestamp with time zone NOT NULL,
  pax_type integer NOT NULL,
  last_updated timestamp with time zone NOT NULL,
  -- service and client_id carry no NOT NULL constraint, i.e. they are nullable.
  service integer,
  client_id character varying(50),
  CONSTRAINT skybus_ticket_pkey PRIMARY KEY (id),
  -- NOTE(review): sale_id is not declared in the column list above.
  -- The check is deferred to commit time (DEFERRABLE INITIALLY DEFERRED).
  CONSTRAINT skybus_ticket_sale_id_fkey FOREIGN KEY (sale_id)
      REFERENCES skybus_sale (id) MATCH SIMPLE
      ON UPDATE NO ACTION ON DELETE NO ACTION DEFERRABLE INITIALLY DEFERRED
)
-- Rows are created without the legacy system OID column.
WITH (
  OIDS=FALSE
);
-- Hand ownership of the table to role umd.
ALTER TABLE ticket
  OWNER TO umd;

-- Secondary b-tree indexes on ticket.
-- NOTE(review): profile_id, sale_id, ticket_number and topup_for are not among
-- the columns listed in the abridged ticket definition; confirm against the
-- real schema.

-- Index on client_id.
CREATE INDEX ticket_client_id_idx ON ticket USING btree (client_id COLLATE pg_catalog."default");

-- Index on profile_id.
CREATE INDEX ticket_profile_id_idx ON ticket USING btree (profile_id);

-- Index on sale_id (the index itself is named skybus_ticket_sale_id).
CREATE INDEX skybus_ticket_sale_id ON ticket USING btree (sale_id);

-- Index on ticket_number with the default operator class.
CREATE INDEX ticket_ticket_number ON ticket USING btree (ticket_number COLLATE pg_catalog."default");

-- Second index on ticket_number using varchar_pattern_ops, the operator class
-- intended for LIKE / prefix pattern matching.
CREATE INDEX ticket_ticket_number_like ON ticket USING btree (ticket_number COLLATE pg_catalog."default" varchar_pattern_ops);

-- Index on topup_for.
CREATE INDEX ticket_topup_for_idx ON ticket USING btree (topup_for COLLATE pg_catalog."default");

- ===============================

-- Table tickethistory: primary key id; ticket_id references ticket (id).
-- The report queries in this document take the row with the highest id per
-- ticket_id as the "latest" row and read final_status and
-- final_remaining_uses from it.
-- NOTE(review): presumably one row per action performed on a ticket, with the
-- final_* columns holding the state after that action; confirm with the
-- application code.
CREATE TABLE tickethistory
(
  id serial NOT NULL,
  -- No NOT NULL constraint: a history row may have a NULL ticket_id.
  ticket_id integer,
  action integer NOT NULL,
  action_result integer NOT NULL,
  initial_status integer NOT NULL,
  final_status integer NOT NULL,
  final_remaining_uses integer NOT NULL,
  ticket_type integer NOT NULL,
  action_when timestamp with time zone NOT NULL,
  last_updated timestamp with time zone NOT NULL,
  service integer,
  CONSTRAINT tickethistory_pkey PRIMARY KEY (id),
  -- The check is deferred to commit time (DEFERRABLE INITIALLY DEFERRED).
  CONSTRAINT tickethistory_ticket_id_fkey FOREIGN KEY (ticket_id)
      REFERENCES ticket (id) MATCH SIMPLE
      ON UPDATE NO ACTION ON DELETE NO ACTION DEFERRABLE INITIALLY DEFERRED
)
-- Rows are created without the legacy system OID column.
WITH (
  OIDS=FALSE
);
-- Hand ownership of the table to role umd.
ALTER TABLE tickethistory
  OWNER TO umd;

-- B-tree index on the foreign-key column tickethistory.ticket_id.
CREATE INDEX tickethistory_ticket_id ON tickethistory USING btree (ticket_id);

-- ===== 执行计划（这是改用 row_number() 之后的结果）

    "HashAggregate  (cost=4526158.63..4526158.64 rows=1 width=16) (actual time=382849.323..382849.376 rows=41 loops=1)"
"  ->  Nested Loop  (cost=3880592.94..4526158.62 rows=1 width=16) (actual time=380338.613..382825.688 rows=11745 loops=1)"
"        ->  Subquery Scan on sub  (cost=3880592.94..4463424.47 rows=6563 width=8) (actual time=126346.043..258837.523 rows=293717 loops=1)"
"              Filter: ((sub.remaining_uses > 0) AND (sub.rn = 1) AND (sub.status = 1))"
"              Rows Removed by Filter: 15244064"
"              ->  WindowAgg  (cost=3880592.94..4191436.42 rows=15542174 width=203) (actual time=126345.775..237172.180 rows=15537781 loops=1)"
"                    ->  Sort  (cost=3880592.94..3919448.38 rows=15542174 width=203) (actual time=126345.757..180461.191 rows=15537781 loops=1)"
"                          Sort Key: th.ticket_id, th.*"
"                          Sort Method: external merge  Disk: 3050616kB"
"                          ->  Seq Scan on skybus_tickethistory th  (cost=0.00..483544.74 rows=15542174 width=203) (actual time=14.091..53312.782 rows=15537781 loops=1)"
"        ->  Index Scan using skybus_ticket_pkey on skybus_ticket t  (cost=0.00..9.55 rows=1 width=12) (actual time=0.418..0.418 rows=0 loops=293717)"
"              Index Cond: (id = sub.ticket_id)"
"              Filter: ((source = ANY ('{0,1,2,6,7,8}'::integer[])) AND (created_date < ('now'::cstring)::date) AND (expiry_date >= (('now'::cstring)::date - 1)) AND (created_date > (('now'::cstring)::date - 30)) AND (ticket_type = ANY ('{2,3,4,5,6,7,16,17, (...)"
"              Rows Removed by Filter: 1"
"Total runtime: 383045.381 ms"

2 个答案:

答案 0 :(得分:2)

您可以使用row_number()一次性获取每张票的最新行:

-- Sketch: fetch the latest TicketHistory row per ticket in a single pass with
-- row_number(), then join it to ticket.
with    last_history as
        (
        select  *
        from    (
                -- Number each ticket's history rows from newest to oldest.
                -- Order by the id column: ordering by a whole-row reference
                -- forces a sort on every column of the table.
                select  row_number() over (partition by ticket_id
                                           order by id desc) as rn
                ,       *
                from    TicketHistory
                ) sub
        where   rn = 1 -- Latest history row only
        )
select  *
from    ticket t
join    last_history th  -- the CTE must be referenced by name, then aliased
on      t.id = th.ticket_id
-- Filter on the latest history row; a bare remaining_uses would resolve to
-- ticket.remaining_uses instead.
where   th.final_remaining_uses > 0
        and th.final_status = 1
        -- and <... other conditions ...>
;

答案 1 :(得分:1)

distinct on () 通常是在 Postgres 中解决这类"每组取最新一行"问题的最快方法：

-- Report: number of tickets per (ticket_type, remaining_uses, source), using
-- DISTINCT ON to pick the latest TicketHistory row for each ticket.
select ticket_type,f_rows.remaining_uses,t.source,count(t.id) as total
FROM (
    -- Filter rows to get those where remaining_uses > 0 and status = 1
    SELECT *
    FROM (
        -- Get the latest row for each ticket: DISTINCT ON keeps the first row
        -- of every ticket_id group, and the ORDER BY puts the highest id first.
        SELECT distinct on (ticket_id)
               ticket_id,
               final_remaining_uses AS remaining_uses,
               final_status AS status, action_when
        FROM TicketHistory th
        order by ticket_id, id desc
    ) latest_rows
    WHERE remaining_uses > 0
      AND status = 1 --and (action_when < current_date and action_when > current_date -30)
) f_rows
  JOIN Ticket t ON f_rows.ticket_id = t.id
WHERE t.expiry_date >= current_date -1
  and source in (0,1,2,6,7,8)
  -- created strictly within the 30 days before today
  and created_date < current_date
  and created_date > current_date - 30
GROUP BY ticket_type, f_rows.remaining_uses, t.source
order by source, ticket_type, remaining_uses;

distinct on () 与 order by 一起使用时，会为每个 ticket_id 返回 tickethistory.id 值最高的那一行。

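在 tickethistory (ticket_id, id desc) 上建立索引可能有所帮助。也许甚至可以通过在 tickethistory (ticket_id, id desc, final_remaining_uses, final_status, action_when) 上建立索引来启用仅索引扫描（index only scan）。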

但是，使用一个存储创建时刻的时间戳列来排序可能更准确。如果 tickethistory.id 是通过序列生成的（例如因为它是 serial 类型），那么这些值可能无法反映实际的插入顺序。