The Graph Structure in the Web -- Analyzed on Different Aggregation Levels
DOI:
https://doi.org/10.1561/106.00000003
Abstract
Knowledge about the general graph structure of the World Wide Web is important for understanding the social mechanisms that govern its growth, for designing ranking methods, for devising better crawling algorithms, and for creating accurate models of its structure. In this paper, we analyze a large web graph. The graph was extracted from a large publicly accessible web crawl that was gathered by the Common Crawl Foundation in 2012. The graph covers over 3.5 billion web pages and 128.7 billion links. We analyze and compare, among other features, degree distributions, connectivity, average distances, and the structure of weakly/strongly connected components.
We conduct our analysis on three different levels of aggregation: page, host, and pay-level domain (PLD) (one "dot level" above public suffixes). Our analysis shows that, as evidenced by previous research, some of the features previously observed by Broder et al. are very dependent on artifacts of the crawling process, whereas others appear to be more structural. We confirm the existence of a giant strongly connected component; we however find, as observed by other researchers, very different proportions of nodes that can reach or that can be reached from the giant component, suggesting that the "bow-tie structure" as described by Broder et al. is strongly dependent on the crawling process, and to the best of our current knowledge is not a structural property of the web.
More importantly, statistical testing and visual inspection of size-rank plots show that the distributions of indegree, outdegree and sizes of strongly connected components of the page and host graph are not power laws, contrary to what was previously reported for much smaller crawls, although they might be heavy-tailed. If we aggregate at pay-level domain, however, a power law emerges. We also provide for the first time accurate measurement of distance-based features, using recently introduced algorithms that scale to the size of our crawl.
References
@inproceedings{Baeza-Yates2003,
  author    = {Baeza-Yates, R. and Poblete, B.},
  title     = {Evolution of the {Chilean} Web structure composition},
  booktitle = {Proc. of Latin American Web Conference 2003},
  year      = {2003},
  pages     = {11--13},
  doi       = {10.1109/LAWEB.2003.1250276},
  url       = {http://www.cwr.cl/la-web/2003/stamped/02_baeza-yates-poblete.pdf}
}
@article{Baeza-Yates2007,
  title      = {Characterization of National Web Domains},
  author     = {Baeza-Yates, Ricardo and Castillo, Carlos and Efthimiadis, Efthimis N.},
  journal    = {ACM Trans. Internet Technol.},
  year       = {2007},
  month      = may,
  volume     = {7},
  number     = {2},
  articleno  = {9},
  issue_date = {May 2007},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1533-5399},
  doi        = {10.1145/1239971.1239973},
  url        = {http://doi.acm.org/10.1145/1239971.1239973},
  acmid      = {1239973},
  keywords   = {Web characterization, Web measurement}
}
@article{AlstottJ2014,
  author    = {Alstott, Jeff and Bullmore, Ed and Plenz, Dietmar},
  title     = {{powerlaw}: A {Python} Package for Analysis of Heavy-Tailed Distributions},
  journal   = {PLoS ONE},
  year      = {2014},
  volume    = {9},
  number    = {1},
  pages     = {e85777},
  doi       = {10.1371/journal.pone.0085777},
  url       = {http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0085777},
  owner     = {Robert Meusel},
  timestamp = {2014.07.21}
}
@inproceedings{Laura2003,
  author    = {Laura, Luigi and Leonardi, Stefano and Millozzi, Stefano and Meyer, Ulrich and Sibeyn, Jop F.},
  title     = {Algorithms and Experiments for the Webgraph},
  booktitle = {Algorithms -- {ESA} 2003},
  editor    = {Di Battista, Giuseppe and Zwick, Uri},
  series    = {Lecture Notes in Computer Science},
  volume    = {2832},
  pages     = {703--714},
  publisher = {Springer Berlin Heidelberg},
  year      = {2003},
  isbn      = {978-3-540-20064-2},
  doi       = {10.1007/978-3-540-39658-1_63},
  url       = {http://dx.doi.org/10.1007/978-3-540-39658-1_63},
  language  = {English},
  file      = {:C:UsersRobertWifoVLiteratureAlgorithms and experiments for the webgraph.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2014.07.21}
}
@techreport{Malevergne,
  author      = {Malevergne, Yannick and Pisarenko, V. and Sornette, D.},
  title       = {{Gibrat's} law for cities: uniformly most powerful unbiased test of the {Pareto} against the lognormal},
  institution = {Swiss Finance Institute},
  type        = {Swiss Finance Institute Research Paper Series},
  number      = {09-40},
  year        = {2009},
  url         = {http://EconPapers.repec.org/RePEc:chf:rpseri:rp0940},
  keywords    = {City sizes; Gibrat's law; Zipf's law.},
  owner       = {Robert Meusel},
  timestamp   = {2014.07.21}
}
@inproceedings{Meusel2014,
  author       = {Meusel, Robert and Vigna, Sebastiano and Lehmberg, Oliver and Bizer, Christian},
  title        = {Graph structure in the web --- revisited: a trick of the heavy tail},
  booktitle    = {Proceedings of the 23rd International Conference on World Wide Web ({WWW} '14 Companion)},
  year         = {2014},
  pages        = {427--432},
  organization = {International World Wide Web Conferences Steering Committee}
}
@inproceedings{Lehmberg2014,
  author       = {Lehmberg, Oliver and Meusel, Robert and Bizer, Christian},
  title        = {Graph structure in the web: aggregated by pay-level domain},
  booktitle    = {Proceedings of the 2014 ACM conference on Web science},
  organization = {ACM},
  year         = {2014},
  pages        = {119--128}
}
@inproceedings{Pandurangan2002,
  author          = {Pandurangan, Gopal and Raghavan, Prabhakar and Upfal, Eli},
  title           = {Using {PageRank} to characterize web structure},
  booktitle       = {Computing and Combinatorics},
  year            = {2002},
  pages           = {330--339},
  doi             = {10.1007/3-540-45655-4_36},
  url             = {http://link.springer.com/chapter/10.1007/3-540-45655-4_36},
  keywords        = {graph structure, pagerank, power law, web measurement, web models},
  mendeley-groups = {Masterthesis/Power law}
}
@article{Malevergne2005,
  author    = {Malevergne, Y. and Pisarenko, V. and Sornette, D.},
  title     = {Empirical distributions of stock returns: between the stretched exponential and the power law?},
  journal   = {Quantitative Finance},
  year      = {2005},
  volume    = {5},
  number    = {4},
  pages     = {379--401},
  doi       = {10.1080/14697680500151343},
  url       = {http://dx.doi.org/10.1080/14697680500151343},
  owner     = {Robert Meusel},
  timestamp = {2014.07.21}
}
@article{Baringhaus1996,
  author       = {Baringhaus, Ludwig},
  title        = {{Fibonacci} numbers, {Lucas} numbers
                  and integrals of certain {Gaussian} processes},
  journal      = {Proc. Amer. Math. Soc.},
  year         = {1996},
  volume       = {124},
  number       = {12},
  pages        = {3875--3884},
  abstract     = {We study the distributions of integrals of Gaussian processes arising
                  as limiting distributions of test statistics proposed for treating
                  a goodness of fit or symmetry problem. We show that the cumulants
                  of the distributions can be expressed in terms of Fibonacci numbers
                  and Lucas numbers.},
  dcr_pdf_hash = {775E2569174FAEB6039BBA260608C4E475AD712934C1928B184C23F7A51A82},
  keywords     = {Gaussian processes, Fibonacci numbers, Lucas numbers, integral equations,
                  empirical Fourier transform, testing for normality, testing for symmetry},
  owner        = {Robert Meusel},
  timestamp    = {2013.12.10},
  url          = {http://www.ams.org/journals/proc/1996-124-12/S0002-9939-96-03691-X/home.html}
}
@book{Vajda2008,
  author    = {Vajda, Steven},
  title     = {{Fibonacci} and {Lucas} numbers, and the golden section: theory and
               applications},
  publisher = {Dover Publications},
  address   = {Mineola, N.Y.},
  year      = {2008},
  owner     = {Robert Meusel},
  timestamp = {2013.12.10},
}
@article{Clauset2009,
  title      = {Power-Law Distributions in Empirical Data},
  author     = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.},
  journal    = {SIAM Rev.},
  year       = {2009},
  month      = nov,
  volume     = {51},
  number     = {4},
  pages      = {661--703},
  numpages   = {43},
  issue_date = {November 2009},
  publisher  = {Society for Industrial and Applied Mathematics},
  address    = {Philadelphia, PA, USA},
  issn       = {0036-1445},
  doi        = {10.1137/070710111},
  url        = {http://dx.doi.org/10.1137/070710111},
  acmid      = {1655789},
  keywords   = {Pareto, Zipf, heavy-tailed distributions, likelihood ratio test, maximum likelihood, model selection, power-law distributions}
}
@techreport{Page1999,
  title       = {The {P}age{R}ank Citation Ranking: Bringing Order to the Web},
  author      = {Page, Lawrence and Brin, Sergey and Motwani, Rajeev and Winograd, Terry},
  institution = {Stanford Digital Library Technologies Project, Stanford University},
  number      = {SIDL-WP-1999-0120},
  year        = {1998}
}
@inproceedings{Brin1998,
  title        = {The Anatomy of a Large-Scale Hypertextual Web Search Engine},
  author       = {Brin, S. and Page, L.},
  booktitle    = {Seventh International World-Wide Web Conference (WWW 1998)},
  year         = {1998},
  url          = {http://ilpubs.stanford.edu:8090/361/},
  abstract     = {In this paper, we present Google, a prototype of a large-scale search
                  engine which makes heavy use of the structure present in hypertext.
                  Google is designed to crawl and index the Web efficiently and produce
                  much more satisfying search results than existing systems. The prototype
                  with a full text and hyperlink database of at least 24 million pages
                  is available at http://google.stanford.edu/. To engineer a search
                  engine is a challenging task. Search engines index tens to hundreds
                  of millions of web pages involving a comparable number of distinct
                  terms. They answer tens of millions of queries every day. Despite
                  the importance of large-scale search engines on the web, very little
                  academic research has been done on them. Furthermore, due to rapid
                  advance in technology and web proliferation, creating a web search
                  engine today is very different from three years ago. This paper provides
                  an in-depth description of our large-scale web search engine -- the
                  first such detailed public description we know of to date. Apart
                  from the problems of scaling traditional search techniques to data
                  of this magnitude, there are new technical challenges involved with
                  using the additional information present in hypertext to produce
                  better search results. This paper addresses this question of how
                  to build a practical large-scale system which can exploit the additional
                  information present in hypertext. Also we look at the problem of
                  how to effectively deal with uncontrolled hypertext collections where
                  anyone can publish anything they want.},
  dcr_pdf_hash = {5ACB78AB5B58708DCE1704D569235EF50AECC53735A8C56EA1B83A8FD29B93B},
  owner        = {Robert Meusel},
  timestamp    = {2013.12.02}
}
@article{Broder2000,
  title   = {Graph structure in the {W}eb: experiments and models},
  author  = {Broder, Andrei and Kumar, Ravi and Maghoul, Farzin and Raghavan, Prabhakar and Rajagopalan, Sridhar and Stata, Raymie and Tomkins, Andrew and Wiener, Janet},
  journal = {Computer Networks},
  year    = {2000},
  volume  = {33},
  number  = {1--6},
  pages   = {309--320}
}
@article{Hall2012,
  author    = {Hall, Wendy and Tiropanis, Thanassis},
  title     = {Web evolution and {Web Science}},
  journal   = {Computer Networks},
  year      = {2012},
  volume    = {56},
  number    = {18},
  pages     = {3859--3865},
  doi       = {10.1016/j.comnet.2012.10.004},
  issn      = {1389-1286},
  url       = {http://www.sciencedirect.com/science/article/pii/S1389128612003581},
  keywords  = {Web Science},
  file      = {:C:UsersRobertWifoVLiteratureComputer-Networks-2012-Hall.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2013.09.19}
}
@article{Kumar1999,
  author   = {Kumar, Ravi and Raghavan, Prabhakar and Rajagopalan, Sridhar and Tomkins, Andrew},
  title    = {Trawling the {Web} for emerging cyber-communities},
  journal  = {Computer Networks},
  year     = {1999},
  volume   = {31},
  number   = {11--16},
  pages    = {1481--1493},
  doi      = {10.1016/S1389-1286(99)00040-7},
  issn     = {1389-1286},
  url      = {http://www.sciencedirect.com/science/article/pii/S1389128699000407},
  keywords = {Web mining}
}
@incollection{Batagelj2003,
  author    = {Batagelj, Vladimir and Mrvar, Andrej},
  title     = {{Pajek} -- analysis and visualization of large networks},
  booktitle = {Graph Drawing Software},
  publisher = {Springer},
  year      = {2003},
  pages     = {77--103},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.108.5239&rep=rep1&type=pdf}
}
@article{Batagelj1998,
  author               = {Batagelj, Vladimir and Mrvar, Andrej},
  title                = {{Pajek} -- Program for Large Network Analysis},
  journal              = {Connections},
  year                 = {1998},
  volume               = {21},
  pages                = {47--57},
  abstract             = {Large networks, having thousands of vertices and lines, can be found
                          in many different areas, e. g: genealogies, flow graphs of programs,
                          molecule, computer networks, transportation networks, social networks,
                          intra/inter organisational networks... Many standard network algorithms
                          are very time and space consuming and therefore unsuitable for analysis
                          of such networks. In the article we present some approaches to analysis
                          and visualisation of large networks implemented in program Pajek.
                          Some typical examples are also given.},
  citeulike-article-id = {3177215},
  citeulike-linkout-0  = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.27.9156},
  file                 = {:C:UsersRobertWifoVLiteraturepajek98.pdf:PDF},
  posted-at            = {2008-12-16 11:25:46},
  priority             = {2},
  url                  = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.27.9156}
}
@inproceedings{Boldi2004a,
  author        = {Boldi, Paolo and Vigna, Sebastiano},
  title         = {The {W}eb{G}raph Framework {I}: {C}ompression Techniques},
  booktitle     = {Proc. WWW'04},
  year          = {2004},
  pages         = {595--602},
  organization  = {ACM},
  url           = {http://vigna.di.unimi.it/ftp/papers/WebGraphI.pdf},
  dcr_bibtex_id = {10601},
  dcr_pdf_hash  = {32B21A12B369C7F82E929494384E2C698F3686BC56541096EBD252635768A},
  file          = {:C:UsersRobertWifoVLiteratureWebGraphI.pdf:PDF},
  owner         = {Robert Meusel},
  timestamp     = {2013.09.17}
}
@inproceedings{Bizer2013,
  author    = {Bizer, Christian and Eckert, Kai and Meusel, Robert and M{\"u}hleisen, Hannes and Schuhmacher, Michael and V{\"o}lker, Johanna},
  title     = {Deployment of {RDFa}, {Microdata}, and {Microformats} on the {Web} -- A Quantitative
               Analysis},
  booktitle = {Proc. of the In-Use Track International Semantic Web Conference 2013},
  year      = {2013},
  month     = oct,
  file      = {:C:UsersRobertWifoVLiteratureBizer-etal-DeploymentRDFaMicrodataMicroformats-ISWC-InUse-2013.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2013.09.16},
}
@article{Serrano2007,
  author        = {Serrano, M. {\'A}ngeles and Maguitman, Ana and Bogu{\~n}{\'a}, Mari{\'a}n and
                   Fortunato, Santo and Vespignani, Alessandro},
  title         = {Decoding the structure of the {WWW}: A comparative analysis of {Web}
                   crawls},
  LONGjournal   = {ACM Transactions on the Web (TWEB)},
  journal       = {TWEB},
  year          = {2007},
  volume        = {1},
  number        = {2},
  pages         = {10},
  publisher     = {ACM},
  url           = {http://complex.ffn.ub.es/ckfinder/userfiles/files/a10-serrano.pdf},
  dcr_bibtex_id = {10485},
  dcr_pdf_hash  = {5B4F237F9CB80FBC19322DF96D1BB6D85201DAF5F489AC88C3429404DB923},
  file          = {:C:UsersRobertWifoVLiteraturea10-serrano.pdf:PDF},
  owner         = {Robert Meusel},
  timestamp     = {2013.09.11}
}
@techreport{Spiegler2013,
  author      = {Spiegler, Sebastian},
  title       = {Statistics of the {Common Crawl Corpus} 2012},
  institution = {SwiftKey},
  year        = {2013},
  month       = jun,
  LONGnote    = {Document viewed on September 16th 2013 from https://docs.google.com/file/d/1_9698uglerxB9nAglvaHkEgU-iZNm1TvVGuCW7245-WGvZq47teNpb_uL5N9/},
  abstract    = {The 2012 Common Crawl corpus is an excellent opportunity for individuals
                 or businesses to cost- effectively access a large portion of the
                 internet: 210 terabytes of raw data corresponding to 3.83 billion
                 documents or 41.4 million distinct second- level domains. Twelve
                 of the top-level domains have a representation of above 1% whereas
                 documents from .com account to more than 55% of the corpus. The corpus
                 contains a large amount of sites from youtube.com, blog publishing
                 services like blogspot.com and wordpress.com as well as online shopping
                 sites such as amazon.com. These sites are good sources for comments
                 and reviews. Almost half of all web documents are utf-8 encoded whereas
                 the encoding of the 43% is unknown. The corpus contains 92% HTML
                 documents and 2.4% PDF files. The remainder are images, XML or code
                 like JavaScript and cascading style sheets.},
  owner       = {Robert Meusel},
  timestamp   = {2013.09.16},
  url         = {https://docs.google.com/file/d/1_9698uglerxB9nAglvaHkEgU-iZNm1TvVGuCW7245-WGvZq47teNpb_uL5N9/}
}
@article{Barabasi1999,
  author    = {Barab{\'a}si, Albert-L{\'a}szl{\'o} and Albert, R{\'e}ka},
  title     = {Emergence of Scaling in Random Networks},
  journal   = {Science},
  year      = {1999},
  month     = oct,
  volume    = {286},
  number    = {5439},
  pages     = {509--512},
  url       = {http://arxiv.org/pdf/cond-mat/9910332.pdf},
  file      = {:C:UsersRobertWifoVLiterature9910332.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2013.09.11}
}
@inproceedings{Boldi2002,
  title     = {Structural properties of the {African} web},
  author    = {Boldi, Paolo and Codenotti, Bruno and Santini, Massimo and Vigna, Sebastiano},
  booktitle = {Proc. WWW'02},
  year      = {2002},
  url       = {http://vigna.di.unimi.it/ftp/papers/www2002b/poster.pdf},
  file      = {:C:UsersRobertWifoVLiteratureStructuralpropertiesoftheAfricanweb.pdf:PDF},
  timestamp = {2013.09.11},
  owner     = {Robert Meusel}
}
@article{Dill2002,
  author     = {Dill, Stephen and Kumar, Ravi and McCurley, Kevin S. and Rajagopalan, Sridhar
                and Sivakumar, D. and Tomkins, Andrew},
  title      = {Self-similarity in the web},
  journal    = {ACM Trans. Internet Technol.},
  year       = {2002},
  month      = aug,
  volume     = {2},
  number     = {3},
  pages      = {205--223},
  numpages   = {19},
  issue_date = {August 2002},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {1533-5399},
  doi        = {10.1145/572326.572328},
  url        = {http://doi.acm.org/10.1145/572326.572328},
  acmid      = {572328},
  keywords   = {Fractal, Web-based services, World-Wide-Web, graph structure, online
                information services, self-similarity},
  file       = {:C:UsersRobertWifoVLiteraturep205-dill.pdf:PDF},
  owner      = {Robert Meusel},
  timestamp  = {2013.09.11}
}
@article{Dlugolinsky2012,
  author        = {Dlugolinsk{\'y}, {\v{S}}tefan and {\v{S}}eleng, Martin and Laclav{\'\i}k,
                   Michal and Hluch{\'y}, Ladislav},
  title         = {Distributed web-scale infrastructure for crawling, indexing and search
                   with semantic support},
  journal       = {Computer Science},
  year          = {2012},
  volume        = {13},
  number        = {4},
  doi           = {10.7494/csci.2012.13.4.5},
  url           = {http://journals.bg.agh.edu.pl/COMPUTER/2012.13.4/csci.2012.13.4.5.pdf},
  dcr_bibtex_id = {10339},
  dcr_pdf_hash  = {C080B8235BB3A857D94C8BF99DDF0A1D5BEC4D37B931DC283488A4EBF23C},
  file          = {:C:UsersRobertWifoVLiteraturedistributed_web_framework.pdf:PDF},
  owner         = {Robert Meusel},
  timestamp     = {2013.09.04}
}
@inproceedings{Donato2005,
  author        = {Donato, Debora and Leonardi, Stefano and Millozzi, Stefano and Tsaparas, Panayiotis},
  title         = {Mining the inner structure of the {Web} graph},
  booktitle     = {WebDB},
  year          = {2005},
  pages         = {145--150},
  url           = {http://www.research.yahoo.net/files/donato2008mining.pdf},
  dcr_bibtex_id = {10484},
  dcr_pdf_hash  = {FFF3EA1F2F404FB9806DE4E5A734D1AE686D10853CA30967C4566790A33D1},
  file          = {:C:UsersRobertWifoVLiteraturedonato2008mining.pdf:PDF},
  owner         = {Robert Meusel},
  timestamp     = {2013.09.11}
}
@inproceedings{Gubichev2010,
  title         = {Fast and accurate estimation of shortest paths in large graphs},
  author        = {Gubichev, Andrey and Bedathur, Srikanta and Seufert, Stephan and Weikum, Gerhard},
  booktitle     = {Proc. of the 19th ACM international conference on Information
                   and knowledge management},
  organization  = {ACM},
  pages         = {499--508},
  year          = {2010},
  file          = {:C:UsersRobertWifoVLiteratureaspsn-cikm.pdf:PDF},
  dcr_pdf_hash  = {52C9153CDD3B40DA40F698EF2B6212A154EEEDCFB4FBEE1D116FCC4189EA12C},
  dcr_bibtex_id = {10459},
  timestamp     = {2013.09.11},
  owner         = {Robert Meusel}
}
@incollection{Hirate2008,
  title         = {Web structure in 2005},
  author        = {Hirate, Yu and Kato, Shin and Yamana, Hayato},
  booktitle     = {Algorithms and models for the web-graph},
  pages         = {36--46},
  year          = {2008},
  publisher     = {Springer},
  file          = {:C:UsersRobertWifoVLiteratureweb2005.pdf:PDF},
  dcr_pdf_hash  = {352EAD7D9497134A66BFAE4D777FB7FFAD7C87D5D9222975F6DDB37D25A593},
  dcr_bibtex_id = {10486},
  timestamp     = {2013.09.11},
  owner         = {Robert Meusel}
}
@inproceedings{Kang2009,
  author        = {Kang, U and Tsourakakis, Charalampos E. and Faloutsos, Christos},
  title         = {{PEGASUS}: A peta-scale graph mining system implementation and observations},
  booktitle     = {Proc. of the 9th {IEEE} International Conference on Data Mining ({ICDM} '09)},
  year          = {2009},
  pages         = {229--238},
  organization  = {IEEE},
  dcr_bibtex_id = {10462},
  dcr_pdf_hash  = {CF1BF1A6D7F4CEA93942F301A5461273D8AA1F8881FE828FC54C8409F73E9FB},
  file          = {:C:UsersRobertWifoVLiteraturepegasus.pdf:PDF},
  owner         = {Robert Meusel},
  timestamp     = {2013.09.11}
}
@inproceedings{Kyrola2012,
  author    = {Kyrola, Aapo and Blelloch, Guy and Guestrin, Carlos},
  title     = {{GraphChi}: Large-scale graph computation on just a {PC}},
  booktitle = {Proc. OSDI'12},
  year      = {2012},
  pages     = {31--46},
  url       = {https://www.usenix.org/system/files/conference/osdi12/osdi12-final-126.pdf},
  file      = {:C:UsersRobertWifoVLiteratureosdi12-final-126.pdf:PDF}
}
@inproceedings{Luczak-Rosch2013,
  author    = {Luczak-R{\"o}sch, Markus and Tolksdorf, Robert},
  title     = {On the topology of the web of data},
  booktitle = {Proc. of the 24th ACM Conference on Hypertext and Social Media},
  series    = {HT '13},
  year      = {2013},
  pages     = {253--257},
  numpages  = {5},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Paris, France},
  isbn      = {978-1-4503-1967-6},
  doi       = {10.1145/2481492.2481526},
  url       = {http://doi.acm.org/10.1145/2481492.2481526},
  acmid     = {2481526},
  file      = {:C:UsersRobertWifoVLiteraturethetopology_of_the_web_of_data.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2013.09.04}
}
@inproceedings{Malewicz2010,
  author    = {Malewicz, Grzegorz and Austern, Matthew H. and Bik, Aart J. C. and
               Dehnert, James C. and Horn, Ilan and Leiser, Naty and Czajkowski,
               Grzegorz},
  title     = {{Pregel}: a system for large-scale graph processing},
  booktitle = {Proc. SIGMOD'10},
  series    = {SIGMOD '10},
  year      = {2010},
  pages     = {135--146},
  numpages  = {12},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Indianapolis, Indiana, USA},
  isbn      = {978-1-4503-0032-2},
  doi       = {10.1145/1807167.1807184},
  url       = {http://doi.acm.org/10.1145/1807167.1807184},
  acmid     = {1807184},
  keywords  = {distributed computing, graph algorithms},
  file      = {:C:UsersRobertWifoVLiteraturepregel_paper.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2013.09.11}
}
@inproceedings{Zhu2008,
  author    = {Zhu, Jonathan J. H. and Meng, Tao and Xie, Zhengmao and Li, Geng
               and Li, Xiaoming},
  title     = {A teapot graph and its hierarchical structure of the {Chinese} web},
  booktitle = {Proc. WWW'08},
  series    = {WWW '08},
  year      = {2008},
  pages     = {1133--1134},
  numpages  = {2},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Beijing, China},
  isbn      = {978-1-60558-085-2},
  doi       = {10.1145/1367497.1367692},
  url       = {http://doi.acm.org/10.1145/1367497.1367692},
  acmid     = {1367692},
  keywords  = {bow tie graph, daisy graph, self similarity, teapot graph},
  file      = {:C:UsersRobertWifoVLiteraturep1133-Zhu.pdf:PDF},
  owner     = {Robert Meusel},
  timestamp = {2013.09.11}
}
@inproceedings{Fetterly2004,
  author          = {Fetterly, Dennis and Manasse, Mark and Najork, Marc},
  title           = {Spam, damn spam, and statistics: Using statistical analysis to locate spam web pages},
  booktitle       = {Proc. WebDB'04},
  year            = {2004},
  pages           = {1--6},
  url             = {http://dl.acm.org/citation.cfm?id=1017077},
  keywords        = {statistical properties of,web characterization,web spam},
  mendeley-groups = {Web Mining,Masterthesis/unsortiert}
}
@article{Milgram1967,
  author  = {Milgram, Stanley},
  title   = {The small world problem},
  journal = {Psychology Today},
  year    = {1967},
  volume  = {1},
  number  = {1}
}
@article{MaLHSW,
  author  = {Marchiori, Massimo and Latora, Vito},
  title   = {Harmony in the small-world},
  journal = {Physica A: Statistical Mechanics and its Applications},
  year    = {2000},
  volume  = {285},
  number  = {3--4},
  pages   = {539--546},
}
@inproceedings{BoVHB,
  author        = {Boldi, Paolo and Vigna, Sebastiano},
  title         = {In-Core Computation of Geometric Centralities with {H}yper{B}all: A Hundred Billion Nodes and Beyond},
  LONGbooktitle = {Proc.~of 2013 IEEE 13th International Conference on Data Mining Workshops (ICDMW 2013)},
  booktitle     = {ICDMW 2013},
  publisher     = {IEEE},
  year          = {2013}
}
@article{ACKBTS,
  author  = {Achlioptas, Dimitris and Clauset, Aaron and Kempe, David and Moore, Cristopher},
  title   = {On the bias of traceroute sampling: {Or}, power-law degree distributions in regular graphs},
  journal = {Journal of the {ACM}},
  year    = {2009},
  volume  = {56},
  number  = {4},
  pages   = {21:1--21:28}
}
@article{WADMI,
  title       = {Mathematics and the {I}nternet: {A} source of enormous confusion and great potential},
  author      = {Willinger, Walter and Alderson, David and Doyle, John C.},
  LONGjournal = {Notices of the American Mathematical Society},
  journal     = {Notices of the AMS},
  volume      = {56},
  number      = {5},
  pages       = {586--599},
  year        = {2009}
}
@inproceedings{BBRFDS,
  author    = {Backstrom, Lars and Boldi, Paolo and Rosa, Marco and Ugander, Johan and Vigna, Sebastiano},
  title     = {Four Degrees of Separation},
  booktitle = {ACM Web Science 2012: Conference Proceedings},
  publisher = {ACM Press},
  pages     = {45--54},
  year      = {2012}
}
@inproceedings{BoVFDSR,
  title         = {Four Degrees of Separation, Really},
  author        = {Boldi, Paolo and Vigna, Sebastiano},
  LONGbooktitle = {Proceedings of the 2012 International Conference on Advances in Social Networks Analysis and Mining (ASONAM 2012)},
  booktitle     = {ASONAM 2012},
  organization  = {IEEE Computer Society},
  year          = {2012},
  pages         = {1222--1227}
}
@article{LADTTSFG,
  author  = {Li, Lun and Alderson, David L. and Doyle, John and Willinger, Walter},
  title   = {Towards a Theory of Scale-Free Graphs: Definition, Properties, and Implications},
  journal = {Internet Math.},
  volume  = {2},
  number  = {4},
  year    = {2005}
}
@article{VigFB,
  author        = {Vigna, Sebastiano},
  title         = {{Fibonacci} Binning},
  journal       = {CoRR},
  volume        = {abs/1312.3749},
  year          = {2013},
  eprint        = {1312.3749},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1312.3749},
}
@article{BoVACIM,
  author  = {Boldi, Paolo and Vigna, Sebastiano},
  title   = {Axioms for Centrality},
  journal = {Internet Math.},
  year    = {2014},
  note    = {To appear}
}
@misc{RFC1738,
  author       = {Berners-Lee, T. and Masinter, L. and McCahill, M.},
  title        = {{RFC 1738}: Uniform Resource Locators ({URL})},
  howpublished = {IETF Network Working Group},
  institution  = {IETF Network Working Group},
  month        = dec,
  year         = {1994},
  url          = {https://www.rfc-editor.org/rfc/rfc1738}
}
Downloads
Published
Issue
Section
License
The copyright of the published articles stays with the authors.