====== Wikipedia statistics benchmark ====== ===== Goals ===== There several goals of benchmark * Compare engines in OLAP queries for planning, predicting growth, analyzing access patterns to wiki pages, draw trends. * Compare engines in statistical queries for end users, which can be executed in real-time. I.e. How many times that or another page was accessed yesterday vs today. * Understand specific features and characteristic of each engine. * Compare throughput on simple queries * Check ability to load data and serve queries at the same time ( availability during data load ) ===== Introduction ===== It's important for benchmark to use real data. I am going to take statistics of access to WikiPedia pages http://lists.wikimedia.org/pipermail/wikitech-l/2007-December/035435.html Format of raw data is PROJECT PAGE ACCESSED BYTES ... aa Category:User_aa 1 6313 aa Main_Page 3 16473 aa Special:RecentChangesLinked/User:Purodha 1 5701 aa Special:Statistics 1 661 aa Talk:Main_Page 1 8092 ... I made some efforts to normalize data, final tables you can see below :!: Normalized data is available in AWS snapshot ("percona-wikistat", ID:snap-a5f9bacc) ===== Tables ===== * pages Dictionary of pages. 724.550.811 rows. data size: 40476M CREATE TABLE `pages` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `page` varchar(1024) NOT NULL, PRIMARY KEY (`id`) ) * projects Projects dictionary, small table. 2025 rows. CREATE TABLE `projects` ( `id` smallint unsigned NOT NULL AUTO_INCREMENT, `project` varchar(50) NOT NULL, PRIMARY KEY (`id`) ) * datesinfo Table with different part of date. 9624 rows. CREATE TABLE `datesinfo` ( `id` smallint unsigned NOT NULL AUTO_INCREMENT, `datestring` char(11) NOT NULL, `caldate` date NOT NULL, `calday` int(10) unsigned NOT NULL, `calmonth` tinyint(3) unsigned NOT NULL, `calyear` smallint(5) unsigned NOT NULL, `dayofweek` tinyint(3) unsigned NOT NULL, `dayhour` tinyint(3) unsigned NOT NULL, PRIMARY KEY (`id`) ) +------+-------------+------------+--------+----------+---------+-----------+---------+ | id | datestring | caldate | calday | calmonth | calyear | dayofweek | dayhour | +------+-------------+------------+--------+----------+---------+-----------+---------+ ... | 3625 | 20090601-00 | 2009-06-01 | 1 | 6 | 2009 | 2 | 0 | | 3626 | 20090601-01 | 2009-06-01 | 1 | 6 | 2009 | 2 | 1 | | 3627 | 20090601-02 | 2009-06-01 | 1 | 6 | 2009 | 2 | 2 | | 3628 | 20090601-03 | 2009-06-01 | 1 | 6 | 2009 | 2 | 3 | | 3629 | 20090601-04 | 2009-06-01 | 1 | 6 | 2009 | 2 | 4 | | 3630 | 20090601-05 | 2009-06-01 | 1 | 6 | 2009 | 2 | 5 | | 3631 | 20090601-06 | 2009-06-01 | 1 | 6 | 2009 | 2 | 6 | | 3632 | 20090601-07 | 2009-06-01 | 1 | 6 | 2009 | 2 | 7 | | 3633 | 20090601-08 | 2009-06-01 | 1 | 6 | 2009 | 2 | 8 | | 3634 | 20090601-09 | 2009-06-01 | 1 | 6 | 2009 | 2 | 9 | ... +------+-------------+------------+--------+----------+---------+-----------+---------+ * pagestat Fact table. page_count is aggregated amount (sum) of page calls per hour. Raw files per month: - 200906: 68.352M / 3.453.013.109 rows - 200907: 68.152M / 3.442.375.618 rows - 200908: 69063M - 200909: 69480M - 200910: 77.558M / 3.919.634.163 rows - 200911: 74.851M / - 200912: TBD CREATE TABLE `pagestat` ( `date_id` smallint unsigned NOT NULL, `project_id` smallint unsigned NOT NULL, `page_id` int unsigned NOT NULL, `page_count` mediumint unsigned NOT NULL, ) ==== EER Diagram ==== {{:benchmark:wikistat:wikistat.png|}} ===== Queries ===== Queries under consideration: * Q_UNIQ(Year, Month): SELECT count(distinct page_id) FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) WHERE di.calmonth=6 and di.calyear=2009; * Q_DAYSTAT(Year, Month): SELECT caldate, sum(page_count) FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) WHERE di.calmonth=7 and di.calyear=2009 GROUP BY caldate ORDER BY caldate; * Q_TOP_PROJECTS(N,Year,Month): SELECT project, sum(page_count) sm FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) JOIN projects p ON (p.id=project_id ) WHERE di.calmonth=7 and di.calyear=2009 GROUP BY project ORDER BY sm DESC LIMIT 20; * Q_PROJECT_SUM SELECT project_id, sum(page_count) sm FROM pagestat GROUP BY project_id; * Q_HOURPROJECTSTAT(project,Year,Month,Day): SELECT dayhour, sum(page_count) sm FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) JOIN projects p ON (p.id=project_id ) WHERE di.caldate='2009-07-21' AND project='es' GROUP BY dayhour; * Q_NOPROJECTS(Year,Month,Day) SELECT project FROM projects LEFT OUTER JOIN pagestat ON (pagestat.project_id = projects.id) WHERE pagestat.project_id IS NULL ORDER BY project; * Q_NOPROJECTS_1(Year,Month,Day): SELECT project FROM projects LEFT OUTER JOIN (SELECT distinct project_id pid FROM pagestat WHERE date_id IN (SELECT id FROM datesinfo WHERE calmonth=7 and calyear=2009 AND calday=14 )) t1 ON (projects.id=pid) WHERE pid IS NULL ORDER BY project; * Q_DAYWEEKSTAT(project,Year,Month) SELECT dayofweek, sum(page_count) sm FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) JOIN projects p ON (p.id=project_id ) WHERE di.calmonth=6 and di.calyear=2009 AND project='de' GROUP BY dayofweek; * Q_TOP50(project,Year,Month,Day) SELECT page, sum(page_count) sm FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) JOIN projects p ON (p.id=project_id ) JOIN pages pp ON (pp.id=page_id) WHERE di.calmonth=6 and di.calyear=2009 AND di.calday=18 AND project='de' GROUP BY page ORDER BY sm DESC LIMIT 50; * Q_MAXPAGE(Year,Month,Day) SELECT page_id, max(page_count) mx FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) WHERE di.calmonth=6 and di.calyear=2009 AND di.calday=18 GROUP BY page_id ORDER BY mx DESC LIMIT 50; * Q_10000HITS_1(Year,Month,Day) - found pages with more than 10000 hits per hour SELECT page_id, max(page_count) mx FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) WHERE di.calmonth=7 and di.calyear=2009 AND di.calday=18 GROUP BY page_id HAVING max(page_count) > 10000 LIMIT 50; * Q_10000HITS_2(Year,Month,Day) - found pages with more than 10000 hits per hour SELECT page,mx FROM pages JOIN (SELECT page_id, max(page_count) mx FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) WHERE di.calmonth=7 and di.calyear=2009 AND di.calday=18 GROUP BY page_id HAVING max(page_count) > 10000 LIMIT 50) t ON (t.page_id=pages.id); * Q_NOACCESS_SECOND_DAY(project_id, Year, Month, Dday): Find pages accessed more 1000 times one day and not accessed another SELECT page FROM pages JOIN (SELECT t1.page_id FROM (SELECT p1.page_id,sum(page_count) sm FROM pagestat p1 JOIN datesinfo di1 ON ( di1.id=p1.date_id ) WHERE di1.calmonth=6 and di1.calyear=2009 AND di1.calday=18 AND p1.project_id=193 GROUP BY p1.page_id HAVING sum(page_count)>1000) t1 LEFT OUTER JOIN (SELECT p2.page_id FROM pagestat p2 JOIN datesinfo di2 ON ( di2.id=p2.date_id ) WHERE di2.calmonth=6 and di2.calyear=2009 AND di2.calday=19 AND p2.project_id=193) t2 ON (t2.page_id=t1.page_id) WHERE t2.page_id IS NULL) bq ON (pages.id=page_id); * Q_PAGE_MONTHSTATS(page,Year, Month) SELECT caldate, sum(page_count) FROM pagestat JOIN datesinfo ON ( pagestat.date_id = datesinfo.id) JOIN pages ON (pagestat.page_id=pages.id) WHERE page='Yugopolis' AND calmonth=7 and calyear=2009 GROUP by caldate ORDER BY caldate; * Q_PROJECT_AVG(Year, Month) SELECT project, AVG(page_count) FROM pagestat JOIN datesinfo ON ( pagestat.date_id = datesinfo.id) JOIN projects ON (pagestat.project_id=projects.id) WHERE calmonth=7 AND calyear=2009 GROUP by project; * Q_PAGES_PROJECT(Year,Month) Found pages that belong to several projects SELECT page_id, count(distinct project_id) FROM pagestat JOIN datesinfo di ON ( di.id=date_id ) WHERE di.calmonth=7 and di.calyear=2009 GROUP BY page_id HAVING count(distinct project_id) > 10 LIMIT 100; * Q_HOURS_DROP(Date, Hour, project_id): Find page that one hour was accessed two times more than next hour SELECT ps1.page_id, ps1.page_count FROM pagestat ps1 JOIN datesinfo di1 ON ( ps1.date_id = di1.id) JOIN pagestat ps2 ON (ps2.page_id=ps1.page_id) JOIN datesinfo di2 ON ( ps2.date_id = di2.id) WHERE di1.caldate='2009-07-20' AND di1.dayhour=14 AND di2.caldate='2009-07-20' AND di2.dayhour=15 AND ps1.project_id=193 AND ps2.project_id=193 AND ps1.page_count > 2 * ps2.page_count AND ps1.page_count>=1000 LIMIT 100; * Build summary tables with per-day statistics CREATE TABLE pagestat_daily ( page_id int NOT NULL, date_id_from smallint NOT NULL, date_id_to smallint NOT NULL, project_id smallint NOT NULL, page_count int NOT NULL ); INSERT INTO pagestat_daily (project_id, page_id, date_id_from, date_id_to, page_count ) SELECT project_id, page_id, min(date_id), max(date_id), sum(page_count) FROM pagestat JOIN datesinfo ON (date_id=id) WHERE calmonth=6 AND calyear=2009 GROUP BY project_id, page_id,caldate; * Q_DATE_RANGE - date range query on summary table SELECT project, sum(page_count) sm FROM pagestat_daily JOIN datesinfo dstart ON ( date_id_from=dstart.id) JOIN datesinfo dend ON (date_id_to=dend.id) JOIN projects ON (project_id=projects.id) WHERE dstart.caldate>='2009-06-15' AND dend.caldate <= '2009-06-20' GROUP BY project HAVING sum(page_count) > 1000 ORDER BY sm DESC; * Q_PAGE_DATE_RANGE - date range on summary table per page SELECT page, sm FROM pages, ( SELECT page_id, sum(page_count) sm FROM pagestat_daily JOIN datesinfo dstart ON ( date_id_from=dstart.id) JOIN datesinfo dend ON (date_id_to=dend.id) WHERE dstart.caldate>='2009-06-15' AND dend.caldate <= '2009-06-20' GROUP BY page_id ) dt WHERE pages.id=dt.page_id ORDER BY sm DESC LIMIT 100; * Q_HIT_SPIKE - show max per hour spikes SELECT ps.page_id, MAX(ps.page_count/pd.page_count) AS max_daily_spikiness FROM pagestat ps INNER JOIN pagestat_daily pd ON ps.page_id=pd.page_id AND ps.date_id BETWEEN pd.date_id_from AND pd.date_id_to GROUP BY ps.page_id ORDER BY max_daily_spikiness DESC LIMIT 10; * QC_POINT_FACT Retrieve specific page on specific date from fact table SELECT * FROM pagestat ps1 JOIN datesinfo di1 ON ( ps1.date_id = di1.id) WHERE page_id=64193807 AND project_id=193 AND di1.calday=20 AND di1.calmonth=7 AND di1.calyear=2009 AND di1.dayhour=15; * QC_POINT_PAGE(page_id) SELECT * FROM pages WHERE id=253608709; * Retrieve stats for page / project for last hour ===== Engines ===== Engines in consideration: * [[ http://www.percona.com/docs/wiki/benchmark:wikistat:infobright | InfoBright, ICE 3.3 ]] * [[ http://www.percona.com/docs/wiki/benchmark:wikistat:infinidb | InfiniDB, 0.9.5.4-beta ]] * MonetDB, Nov-2010 * LucidDB * [[ http://www.percona.com/docs/wiki/benchmark:wikistat:greenplum | Greenplum, 3.3.4 ]] ==== MonetDB ==== === Tables === CREATE TABLE pages ( id int NOT NULL , page varchar(1024) NOT NULL ) COPY 730000000 records INTO pages FROM '/data/vol2/wikistat/in/pages.txt'; === Notes === Install pcre-6.6 with UTF-8 support ( http://www.fredsantos.net/index.php?option=com_content&view=article&id=114:unicode-support-on-centos-52-with-php-and-pcre&catid=36:linux&Itemid=77 ) rpm -ivh http://mirrors.kernel.org/centos/5.4/os/SRPMS/pcre-6.6-2.el5_1.7.src.rpm change to %configure --enable-utf8 --enable-unicode-properties in pcre.spec or rpm -ivh http://gaarai.com/wp-content/uploads/2009/01/pcre-66-27x86_64.rpm ==== Greenplum ==== /etc/sysctl.conf kernel.shmmax=500000000 kernel.shmmni=4096 kernel.shmall=4000000000 kernel.sem=250 64000 100 512 net.ipv4.tcp_tw_recycle=1 === Tables === CREATE TABLE pages ( id int NOT NULL PRIMARY KEY, page varchar(1024) NOT NULL ); CREATE TABLE pagestat ( date_id smallint NOT NULL, project_id smallint NOT NULL, page_id int NOT NULL, page_count int NOT NULL ) WITH (appendonly=true, orientation=column, compresslevel=1, compresstype=QuickLZ); gpfdist -d /mnt/data/ -p 8081 -l /home/gpadmin/log CREATE EXTERNAL TABLE ext_pages ( id int, page varchar(1024) ) LOCATION ('gpfdist://localhost:8081/pages.txt') FORMAT 'TEXT' (DELIMITER '|'); CREATE EXTERNAL TABLE ext_pagestat ( date_id smallint , project_id smallint, page_id int ,page_count int ) LOCATION ('gpfdist://localhost:8081/200906.out') FORMAT 'TEXT' (DELIMITER '|');